Load all relevant packages:
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.4
## ✔ ggplot2 3.5.1 ✔ stringr 1.5.1
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.1 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)
library(stringr)
library(zoo)
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(ggplot2)
library(urbnmapr)
library(devtools)
## Loading required package: usethis
library(readxl)
library(spdep)
## Loading required package: spData
## To access larger datasets in this package, install the spDataLarge
## package with: `install.packages('spDataLarge',
## repos='https://nowosad.github.io/drat/', type='source')`
## Loading required package: sf
## Linking to GEOS 3.9.3, GDAL 3.5.2, PROJ 8.2.1; sf_use_s2() is TRUE
library(sp)
library(huge)
library(INLA)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Loading required package: foreach
##
## Attaching package: 'foreach'
##
## The following objects are masked from 'package:purrr':
##
## accumulate, when
##
## Loading required package: parallel
## This is INLA_22.12.16 built 2022-12-23 13:24:10 UTC.
## - See www.r-inla.org/contact-us for how to get help.
library(HMMpa)
library(invgamma)
library(brinla)
library(reshape2)
##
## Attaching package: 'reshape2'
##
## The following object is masked from 'package:tidyr':
##
## smiths
library(patchwork)
library(jsonlite)
##
## Attaching package: 'jsonlite'
##
## The following object is masked from 'package:purrr':
##
## flatten
library(geosphere)
library(urbnmapr)
library(RAQSAPI)
## Use the function
## RAQSAPI::aqs_credentials(username, key)
## before using other RAQSAPI functions
## See ?RAQSAPI::aqs_credentials for more information
library(con2aqi)
library(pscl)
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis
library(reshape2)
library(corrplot)
## corrplot 0.92 loaded
library(superheat)
library(shapes)
library(scales)
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
library(kableExtra)
##
## Attaching package: 'kableExtra'
##
## The following object is masked from 'package:dplyr':
##
## group_rows
Loading and (quickly cleaning) all necessary datasets:
###SoA data
soa.data = read_xlsx("SoA.data.1019.xlsx")
###county_flips are unique identifier for counties
soa.data$county_fips = as.character(soa.data$county_fips) ##change it to character
#IMPORTANT
# This shape file contains the coordinates for county boundaries
##counties is from urbanmap
CA.counties = urbnmapr::counties %>% filter(state_abbv == "CA")
###IF WE WANT TO BOIL DOWN TIME SERIES AND KEEP ALL DATA, SWITCH to CA_newdata below
soa_joint <- left_join(CA.counties, soa.data, by = "county_fips")
## Warning in left_join(CA.counties, soa.data, by = "county_fips"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 1 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
#Use with soa.data (full data)
CA_data = soa_joint %>% select(long, lat, county_name.y, Year, Score, Total_Pop,
EDUC_Lessthan9, EDUC_college, White_Collar,
Unemployment_Rate, Adj_HH_income, Income_Disparity,
Individuals_Below_Poverty, Median_Home_Value,
Median_Gross_Rent, Housing_No_Telephone,
Housing_Incomplete_Plumbing)
colnames(CA_data)[3] = "County"
CA_newdata = soa.data[1:58,]
CA_newdata = CA_newdata[,-c(4,7,8)]
###Cal-ViDa data
mortality = read.csv("respmortality1419.csv") #data from 2014-2019 bc we want to avoid COVID pandemic era
mortality = filter(mortality,Cause_of_Death %in% c("Chronic lower respiratory diseases","Influenza and pneumonia"))
mortality = mortality[,-c(1,4,9)]
Population = rep(100000,nrow(mortality))
mortality = cbind(mortality,Population)
Quick descriptions of SoA data variables:
Score: social deprivation index (SDI) score calculated from the following 11 subindices EDUC_Lessthan9: % of population older than 24 with less than 9 years of education EDUC_college: % of population older than 24 with at least four years of college education White_Collar: % of population older than 15 employed in a white collar occupation Unemployment_Rate: unemployment rate for population older than 15 Adj_HH_income: median household income adjusted for local housing costs Income_Disparity: an income disparity ratio Individuals_Below_Poverty: % of population below the federal poverty line Median_Home_Value: median home value for owned, occupied units Median_Gross_Rent: median gross rent for rented units Housing_No_Telephone: % of households without a telephone Housing_Incomplete_Plumbing: % of households with incomplete plumbing
#Initializing map and station locations
ca_map <- map_data("county", region = "california")
#Match population dataset with ca_map
#2010-2019 population data for CA
USpops = read.csv("CA_census_pops1019.csv")
CApops = USpops %>% filter(STNAME == "California") %>% select(CTYNAME,POPESTIMATE2019)
CApops = CApops[-1,]
CApops$CTYNAME = unique(ca_map$subregion)
colnames(CApops) = c("subregion","pop")
merged_data <- merge(ca_map, CApops, by = "subregion", all.x = TRUE)
#Plot
gg_pop <- ggplot() +
geom_polygon(data = merged_data, aes(x = long, y = lat, group = group, fill = pop),
color = "black") +
coord_fixed(ratio = 1.3, xlim = c(-125, -112), ylim = c(30, 42)) +
theme_void() +
labs(title = "Heatmap of County Populations for 2019",fill = expression("Population")) +
scale_fill_gradient(low = "yellow", high = "red")
print(gg_pop)
California state with county labels for reference:
The code below structures the dataframe to be fed into the spatial data frame (SPDF) object. Dimensions are 58 rows by 10 columns (each column is its own year)
###Setting up SPDF for CA counties
CA_sf = st_read(getwd(),"CA_Counties_TIGER2016")
## Reading layer `CA_Counties_TIGER2016' from data source
## `C:\Users\jeffr\Desktop\Spatiotemporal + Causal Inference\Wildfire Paper 1 Code'
## using driver `ESRI Shapefile'
## Simple feature collection with 58 features and 17 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -13857270 ymin: 3832931 xmax: -12705030 ymax: 5162404
## Projected CRS: WGS 84 / Pseudo-Mercator
CA_spdf = as_Spatial(CA_sf)
# score_scaled = scale(CA_data$Score)
c_index = unique(CA_data$County)
y_index = unique(CA_data$Year)
SDI_df = matrix(nrow=10,ncol=58)
track1 = 1
track2 = 1
for (i in c_index){
for (j in y_index){
scores = CA_data %>% filter(County == i) %>% filter(Year == j) %>% select(Score) %>% unique()
SDI_df[track1,track2] = scores$Score
track1 = track1 + 1
}
track1 = 1
track2 = track2 + 1
}
score_scaled = scale(SDI_df) #NEED TO SCALE THE DATA BEFORE FEEDING IT INTO SKATER
#covariates_scale = data.frame(apply(CA_data[,4:16],2,scale))
covariates_scale = data.frame(t(score_scaled))
CA_spdf@data = covariates_scale
Using the SPDF from above, we follow the steps of SKATER tutorial (https://www.dshkol.com/post/spatially-constrained-clustering-and-regionalization/) to generate three separate clustering results: (1) Unconstrained/default (2) Clusters have minimum population constraint based on the total population / # of clusters (3) Clusters are comprised of a minimum number of counties (8 for smaller number of clusters, 4 for bigger numbers)
#Identify neighborhood list for counties
CA_nb = poly2nb(CA_spdf)
#summary(CA_nb)
# plot(CA_spdf, main = "With queen")
# plot(CA_nb, coords = coordinates(CA_spdf), col="blue", add = TRUE)
#Calculate edge costs (dissimilarity matrix) based on Euclidean distance
costs <- nbcosts(CA_nb, data = covariates_scale)
###Get adjacency matrix using nb2mat() (SEPARATE STEP FOR INLA)
adj = nb2mat(CA_nb,style = "B")
#Style means the coding scheme style used to create the weighting matrix
# B: basic binary coding scheme
# W: row standardized coding scheme
# C: globally standardized coding scheme
# U: values of C / number of neighbors
# S: variance stabilizing coding scheme
#Transform edge costs to spatial weights
ct_w <- nb2listw(CA_nb,costs,style="B")
#Create minimum spanning tree
ct_mst <- mstree(ct_w)
plot(ct_mst,coordinates(CA_spdf),col="blue", cex.lab=0.5)
plot(CA_spdf, add=TRUE)
#Run SKATER algorithm to get 7 contiguous clusters (cluster idx is in order of CA_sf)
clus7 <- skater(edges = ct_mst[,1:2], data = covariates_scale, ncuts = 6)
#Determine an appropriate minimum population threshold based on???
pops_summary = summary(unique(CA_data$Total_Pop))
pops_summary
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 12700 63275 219705 710796 750235 10105722
#Idea 1: Use median * (how many counties should be in a cluster at minimum)
min_pop = as.numeric(pops_summary[3] * 4)
#Idea 2: If we assume CA population is 39M, divide total pop by # clusters
min_pop2 = 39000000 / 7
#Add a min population constraint
clus7_min <- skater(edges = ct_mst[,1:2],
data = covariates_scale,
crit = min_pop2,
vec.crit = CA_data$Total_Pop,
ncuts = 6)
#Add a minimum number of areas in each cluster constraint
clus7_minarea = skater(edges = ct_mst[,1:2], data = covariates_scale, ncuts = 6, 4)
CA_data_cluster = (CA_sf %>% mutate(clus = clus7_min$groups))
#Plot clustered CA
plot((CA_sf %>% mutate(clus = clus7$groups))['clus'], main = "7 cluster example")
plot((CA_sf %>% mutate(clus = clus7_min$groups))['clus'], main = "7 cluster example with population constraint")
plot((CA_sf %>% mutate(clus = clus7_minarea$groups))['clus'], main = "7 cluster example with minimum number of areas constraint")
#plot(CA_sf,col=c("red","green","blue","purple","yellow")[clus7_min$groups],max.plot=17)
For reference, here are the cluster labels for each county:
clusterlabels = data.frame(CA_data_cluster$NAME,clus7_min$groups)
names(clusterlabels) = c("counties","Cluster")
o = order(clusterlabels$counties)
clusterlabels = clusterlabels[o,]
rownames(clusterlabels) = NULL
clusterlabels
## counties Cluster
## 1 Alameda 4
## 2 Alpine 5
## 3 Amador 5
## 4 Butte 2
## 5 Calaveras 5
## 6 Colusa 1
## 7 Contra Costa 5
## 8 Del Norte 2
## 9 El Dorado 1
## 10 Fresno 3
## 11 Glenn 1
## 12 Humboldt 2
## 13 Imperial 7
## 14 Inyo 4
## 15 Kern 4
## 16 Kings 4
## 17 Lake 5
## 18 Lassen 6
## 19 Los Angeles 7
## 20 Madera 3
## 21 Marin 5
## 22 Mariposa 5
## 23 Mendocino 2
## 24 Merced 5
## 25 Modoc 6
## 26 Mono 3
## 27 Monterey 4
## 28 Napa 5
## 29 Nevada 1
## 30 Orange 7
## 31 Placer 1
## 32 Plumas 6
## 33 Riverside 7
## 34 Sacramento 5
## 35 San Benito 3
## 36 San Bernardino 4
## 37 San Diego 7
## 38 San Francisco 5
## 39 San Joaquin 4
## 40 San Luis Obispo 4
## 41 San Mateo 4
## 42 Santa Barbara 4
## 43 Santa Clara 4
## 44 Santa Cruz 4
## 45 Shasta 2
## 46 Sierra 1
## 47 Siskiyou 6
## 48 Solano 5
## 49 Sonoma 2
## 50 Stanislaus 5
## 51 Sutter 1
## 52 Tehama 2
## 53 Trinity 6
## 54 Tulare 4
## 55 Tuolumne 5
## 56 Ventura 4
## 57 Yolo 1
## 58 Yuba 6
counties = clusterlabels$counties
num_clus = max(clus7_min$groups)
This code chunk takes the cluster grouping from SKATER and aggregates the full dataframe from the SPDF (58x10) into a 10x7 matrix (10 time points x 7 clusters) to be fed into the graph estimation package HUGE. The data from each county in a given cluster is aggregated based on a population weighted mean.
HUGE uses glasso to estimate a graph structure based on the aggregated feature data which recall, is the SDI score (socioeconomic status) from the SoA. We use a grid of lambda values under 1 in order to ensure that some edges will be present in the estimates produced by HUGE. This decision is supported by the fact that partial correlations calculated via regression appear to be statistically significant. Based on simulation results, we believe that EBIC is a suitable criterion for choosing the best estimated graph in the huge.select() step.
The objective function for EBIC is as follows: \(EBIC_{\gamma}(s) = -2log L_n \{ \hat \theta(s) \} + \nu(s) log n + 2 \gamma log \tau(S_j)\) where \(0 \leq \gamma \leq 1\) and \(S_j\) is model space of size \(\tau(S_j)\)
#Aggregate feature vectors into one vector for each SKATER cluster
CA_cluster = data.frame(CA_sf$NAMELSAD,clus7_min$groups)
names(CA_cluster) = c("County","Cluster")
year = 2010:2019
CA_cluster = left_join(CA_cluster,CA_data,by = "County")
#Get weighted avg value for Score for each cluster for each year
#Create new data matrix of aggregated feature vectors
cluster_features = matrix(NA,nrow = 10,ncol = 7)
for (i in 1:7){
cluster = CA_cluster %>% filter(Cluster == i)
for(j in 1:10){
#Obtain a weighted mean based on population
vec = cluster %>% filter(Year == year[j]) %>% select(Score,Total_Pop) %>% unique()
cluster.pop = sum(vec$Total_Pop)
cluster.popweights = vec$Total_Pop/cluster.pop
cluster_features[j,i] = weighted.mean(vec$Score,cluster.popweights)
}
}
#Graph learning w HUGE
out.glasso = huge(cluster_features,lambda = seq(0.95,0.05,by=-0.05),method="glasso")
## Conducting the graphical lasso (glasso) wtih lossless screening....in progress: 5%Conducting the graphical lasso (glasso) wtih lossless screening....in progress: 10%Conducting the graphical lasso (glasso) wtih lossless screening....in progress: 15%Conducting the graphical lasso (glasso) wtih lossless screening....in progress: 21%Conducting the graphical lasso (glasso) wtih lossless screening....in progress: 26%Conducting the graphical lasso (glasso) wtih lossless screening....in progress: 31%Conducting the graphical lasso (glasso) wtih lossless screening....in progress: 36%Conducting the graphical lasso (glasso) wtih lossless screening....in progress: 42%Conducting the graphical lasso (glasso) wtih lossless screening....in progress: 47%Conducting the graphical lasso (glasso) wtih lossless screening....in progress: 52%Conducting the graphical lasso (glasso) wtih lossless screening....in progress: 57%Conducting the graphical lasso (glasso) wtih lossless screening....in progress: 63%Conducting the graphical lasso (glasso) wtih lossless screening....in progress: 68%Conducting the graphical lasso (glasso) wtih lossless screening....in progress: 73%Conducting the graphical lasso (glasso) wtih lossless screening....in progress: 78%Conducting the graphical lasso (glasso) wtih lossless screening....in progress: 84%Conducting the graphical lasso (glasso) wtih lossless screening....in progress: 89%Conducting the graphical lasso (glasso) wtih lossless screening....in progress: 94%Conducting the graphical lasso (glasso) wtih lossless screening....in progress: 100%
## Conducting the graphical lasso (glasso)....done.
glasso.stars = huge.select(out.glasso,criterion = "stars",stars.thresh = 0.1)
## Conducting Subsampling....in progress:5% Conducting Subsampling....in progress:10% Conducting Subsampling....in progress:15% Conducting Subsampling....in progress:20% Conducting Subsampling....in progress:25% Conducting Subsampling....in progress:30% Conducting Subsampling....in progress:35% Conducting Subsampling....in progress:40% Conducting Subsampling....in progress:45% Conducting Subsampling....in progress:50% Conducting Subsampling....in progress:55% Conducting Subsampling....in progress:60% Conducting Subsampling....in progress:65% Conducting Subsampling....in progress:70% Conducting Subsampling....in progress:75% Conducting Subsampling....in progress:80% Conducting Subsampling....in progress:85% Conducting Subsampling....in progress:90% Conducting Subsampling....in progress:95% Conducting Subsampling....in progress:100% Conducting Subsampling....done.
glasso.ric = huge.select(out.glasso,criterion = "ric")
## Conducting rotation information criterion (ric) selection....done
## Computing the optimal graph....done
glasso.ebic = huge.select(out.glasso,criterion = "ebic")
## Conducting extended Bayesian information criterion (ebic) selection....done
plot(glasso.stars)
plot(glasso.ric)
plot(glasso.ebic)
huge.est = glasso.ebic$refit
huge.est
## [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## [1,] 0 1 1 1 0 1 1
## [2,] 1 0 1 0 1 1 1
## [3,] 1 1 0 1 0 1 0
## [4,] 1 0 1 0 1 0 1
## [5,] 0 1 0 1 0 0 1
## [6,] 1 1 1 0 0 0 1
## [7,] 1 1 0 1 1 1 0
#Identify which clusters/nodes are the most connected on the graph i.e. has the most association with the other nodes
degree_connectivity = data.frame(colSums(huge.est))
colnames(degree_connectivity) = "node_connections"
degree_connectivity = cbind(c(1:num_clus),degree_connectivity)
install.packages("igraph")
library(igraph)
# Convert the adjacency matrix to a graph object
g <- graph_from_adjacency_matrix(huge.est, mode = "undirected")
# Assign custom labels to vertices
V(g)$name <- c(1,2,3,4,5,6,7)
# Assign colors to vertices
V(g)$color <- c("red", "cyan", "green", "yellow", "purple", "orange", "pink")
# Plot the graph with labeled vertices
plot(g, vertex.label = V(g)$name, vertex.color = V(g)$color, vertex.size = 20)
The code below takes the adjacency matrix estimated in the previous step and transforms it into a graph filter H. The steps are explained in Antonian et al 2019 (Gareth Peters’ paper). The cutoff transformation is applied to the eigenvalues of the graph Laplacian.
A = as.matrix(huge.est)
p = nrow(A)
#obtain graph Laplacian L
D = diag(p)
for (i in 1:p){
d = sum(A[,i])
D[i,i] = d
}
L = D - A
#eigendecomposition of L
Ldecomp = eigen(L)
U = as.matrix(Ldecomp$vectors)
Lambdas = Ldecomp$values
#test
#U %*% (diag(p)*Lambdas) %*% t(U)
#Function implementing cutoff tranform for eigenvalues
cutoff.transform = function(lambdas,q){
transformed = c()
cutoff = quantile(lambdas,q)
for (i in lambdas){
if(i <= cutoff){
transformed = c(transformed,1)
}
else{
transformed = c(transformed,0)
}
}
return(transformed)
}
#quantile(Lambdas,2/3)
transformed.L = cutoff.transform(Lambdas,2/3)
eta.L = diag(p)*transformed.L
#obtain graph filter
H = U %*% eta.L %*% t(U)
H
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 0.46065322 0.29315326 0.074264113 0.15087817 -0.215044073 -0.055329344
## [2,] 0.29315326 0.58851806 0.163559807 -0.28070511 0.150878170 -0.068010981
## [3,] 0.07426411 0.16355981 0.824846850 0.13992515 0.002116461 0.086945750
## [4,] 0.15087817 -0.28070511 0.139925150 0.80121219 0.086945750 -0.061815959
## [5,] -0.21504407 0.15087817 0.002116461 0.08694575 0.909676139 -0.008836559
## [6,] -0.05532934 -0.06801098 0.086945750 -0.06181596 -0.008836559 0.956168924
## [7,] 0.29142465 0.15260679 -0.291658130 0.16355981 0.074264113 0.150878170
## [,7]
## [1,] 0.29142465
## [2,] 0.15260679
## [3,] -0.29165813
## [4,] 0.16355981
## [5,] 0.07426411
## [6,] 0.15087817
## [7,] 0.45892461
gfilter_weight = norm((1/7)*H^2,type = "F")
# gfilter_weight = norm(H^2,type = "F")
matrix_heatmap= function(matrix,title = "",gradient_zones = c(0,0.5,0.999)){
r = nrow(matrix)
df = as_tibble(cbind(expand.grid(rev(seq_len(r)),seq_len(r)),c(matrix))) %>% setNames(c("row","col","value"))
df$value[df$value == 1] = 0.999
x_min = min(df$row) - 0.5
x_max = max(df$row) + 0.5
y_min = min(df$col) - 0.5
y_max = max(df$col) + 0.5
plot = ggplot(df,mapping = aes(x=row,y=col,fill=value)) + geom_tile() +
geom_rect(aes(xmin = x_min, xmax = x_max, ymin = y_min, ymax = y_max),
fill = NA, color = "black", inherit.aes = FALSE) +
scale_fill_gradientn(colors = c("white","lightblue","darkblue"),
values = rescale(gradient_zones),
limits = c(0, 0.99),
oob = squish) + ggtitle(title) + theme_void() +
theme(plot.margin = margin(t = 10, r = 30, b = 10, l = 10))
return(plot)
}
#Heatmap of resulting H
corrplot(H, order = 'original', cl.pos = 'b', tl.pos = 'n',method = "color", col = COL1('YlOrRd',10),
title = "Graph filter")
matrix_heatmap(H,title = "")
## Warning in geom_rect(aes(xmin = x_min, xmax = x_max, ymin = y_min, ymax = y_max), : All aesthetics have length 1, but the data has 49 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
## a single row.
library(tidyverse)
library(plyr)
library(dplyr)
library(jsonlite)
library(lubridate)
library(ggplot2)
library(maps)
library(mapdata)
library(geosphere)
library(urbnmapr)
library(RAQSAPI)
library(con2aqi)
aqs_credentials("jeffreywu@ucsb.edu","copperheron86")
#Get county codes
counties_url = "https://aqs.epa.gov/data/api/list/countiesByState?email=jeffreywu@ucsb.edu&key=copperheron86&state=06&"
countycodes = fromJSON(counties_url)
countycodes = countycodes[[2]]
california_counties = countycodes$code
#Get parameter codes
parameters_url = "https://aqs.epa.gov/data/api/list/parametersByClass?email=jeffreywu@ucsb.edu&key=copperheron86&pc=CRITERIA"
parametercodes = fromJSON(parameters_url)
parametercodes = parametercodes[[2]]
parametercodes = parametercodes[-7,]
pollutants = data.frame(parametercodes$code)
labels = c("lead","co","so2","no2","o3","pm10","pm25")
pollutants = cbind(pollutants,labels)
The goal here is to query all of the EPA measurements for all 7 pollutants and AQI from 2014-2019. The first step is to query all stations in California that collect measurements for at least one of the 7 pollutants.
FUNCTION THAT QUERIES STATION LOCATIONS FOR A GIVEN POLLUTANT
query_aqs_station_data <- function(param,year){
start_date <- paste0(year, "0101")
end_date <- paste0(year, "1231")
url <- paste0("https://aqs.epa.gov/data/api/monitors/byState?email=jeffreywu@ucsb.edu&key=copperheron86¶m=", param, "&bdate=", start_date, "&edate=", end_date, "&state=06")
myData <- fromJSON(url)
station_data = myData[[2]]
return(station_data)
}
FOR EACH POLLUTANT, GRAB ALL MONITORING STATIONS FOR EACH YEAR
stations_url = "https://aqs.epa.gov/data/api/monitors/byState?email=jeffreywu@ucsb.edu&key=copperheron86¶m=88101&bdate=20140101&edate=20141231&state=06"
stations = fromJSON(stations_url)
stations = stations[[2]]
station_data2014_pm2.5 = stations %>% select(latitude,longitude,site_number, local_site_name,county_code,county_name)
#Get monitoring station locations for each pollutant for each year (takes approx 3 min)
all_pollutants_station_data <- list()
for (year in 2014:2019){
year_data <- lapply(pollutants, query_aqs_station_data, year = year)
all_pollutants_station_data[[as.character(year)]] <- year_data
}
# 1 - Lead, 2 - Carbon monoxide (CO), 3 - Sulfure dioxide (SO2), 4 - Nitrogen dioxide (NO2)
# 5 - Ozone (O3), 6 - Total PM10, 7 - PM2.5
all_pollutants_station_2014data = all_pollutants_station_data[[1]]
all_pollutants_station_2015data = all_pollutants_station_data[[2]]
all_pollutants_station_2016data = all_pollutants_station_data[[3]]
all_pollutants_station_2017data = all_pollutants_station_data[[4]]
all_pollutants_station_2018data = all_pollutants_station_data[[5]]
all_pollutants_station_2019data = all_pollutants_station_data[[6]]
Starting off with a list of all the stations measuring each parameter/pollutant, we want to identify a subset of stations that measure each pollutant for each county. This overall set of stations (subset for each county combined together) should have a good spatial coverage of the state.
This is difficult because there is not a station in every county measuring each pollutant. So in order to identify a good set of stations to query data from, I first looked up the two largest cities in each county, based on population.
Lat/long for 2 biggest cities (based on population) in each county
citylats = c(37.8044,37.5485,38.7743,38.8071,38.3527,38.3488,39.7285,39.7596,
38.1231,38.0678,39.2143,39.1546,37.9780,38.0049,41.7558,41.7548,
38.9399,38.6688,36.7378,36.8252,39.7474,39.5243,40.8021,40.8665,
32.7920,32.6789,37.3614,37.3855,35.3733,35.7688,36.3275,36.3008,
38.9582,38.8080,40.4163,40.2840,34.0522,33.7701,36.9613,37.1230,
37.9735,38.1074,37.4849,37.4320,39.4457,39.4096,37.3022,37.0583,
41.4871,41.4099,37.6485,38.5149,36.6777,36.6149,38.2975,38.1749,
39.3280,39.2191,33.8366,33.7455,38.7521,38.7907,39.9341,40.3063,
33.9806,33.9425,38.5816,38.4088,36.8525,36.8125,34.1083,34.0922,
32.7157,32.6401,37.7749,37.9780,37.9577,37.7396,35.2828,35.6369,
37.6879,37.5630,34.9530,34.4208,37.3387,37.3688,36.9741,36.9102,
40.5865,40.4482,39.6763,39.5595,41.7354,41.3099,38.1041,38.2492,
38.4404,38.2324,37.6393,37.4946,39.1404,39.1165,40.1785,39.9277,
40.7310,40.4156,36.3301,36.2077,38.0297,37.9829,34.1975,34.1706,
38.5449,38.6785,39.1277,39.0954)
citylongs = c(122.2712,121.9886,119.8219,119.7960,120.9327,120.7741,121.8375,121.6219,
120.8509,120.5385,122.0094,122.1494,122.0311,121.8058,124.2026,124.1580,
119.9772,120.9872,119.7871,119.7029,122.1964,122.1936,124.1637,124.0828,
115.5631,115.4989,118.3997,118.4105,119.0187,119.2471,119.6457,119.7829,
122.6264,122.5583,120.6530,120.5394,118.2437,118.1937,120.0607,120.2602,
122.5311,122.5697,119.9663,120.0985,123.8053,123.3556,120.4830,120.8499,
120.5425,120.6791,118.9721,119.4768,121.6555,121.8221,122.2869,122.2608,
120.1833,121.0611,117.9143,117.8677,121.2880,121.2358,120.8980,121.2319,
117.3755,117.2297,121.4944,121.3716,121.4016,121.3658,117.2898,117.4350,
117.1611,117.0842,122.4194,122.0311,121.2908,121.4260,120.6596,120.6545,
122.4702,122.3255,120.4357,119.6982,121.8853,122.0363,122.0308,121.7569,
122.3917,122.2978,120.2410,120.8277,122.6345,122.3106,122.2566,122.0405,
122.7141,122.6367,120.9970,120.8460,121.6169,121.6380,122.2358,122.1792,
122.9420,123.2100,119.2966,119.3473,119.9741,120.3822,119.1771,118.8376,
121.7405,121.7733,121.5508,121.5522)
citylongs = -1*citylongs
Alameda: Oakland (429082) and Fremont
Alpine: Alpine Village (225) and Mesa Vista
Amador: Ione (8363) and Jackson
Butte: Chico (94776) and Paradise
Calaveras: Rancho Calaveras (5324) and Angels Camp
Colusa: Colusa (5911) and Williams
Contra Costa: Concord (129688) and Antioch
Del Norte: Crescent City (6805) and Bertsch-Oceanview
El Dorado: South Lake Tahoe (22036) and Cameron Park
Fresno: Fresno (530093) and Clovis
Glenn: Orland (7644) and Willows
Humboldt: Eureka (26998) and Arcata
Imperial: El Centro (44120) and Calexico
Inyo: Bishop (3746) and Dixon Lane-Meadow Creek
Kern: Bakersfield (383579) and Delano
Kings: Hanford (56910) and Lemoore
Lake: Clearlake (15384) and Hidden Valley Lake
Lassen: Susanville (15165) and Janesville
Los Angeles: Los Angeles (3990000) and Long Beach
Madera: Madera (65706) and Chowchilla
Marin: San Rafael (58704) and Novato
Mariposa: Mariposa (1526) and Catheys Valley
Mendocino: Fort Bragg (7359) and Willits
Merced: Merced (83316) and Los Banos
Modoc: Alturas (2509) and California Pines
Mono: Mammoth Lakes (8127) and Walker
Monterey: Salinas (156259) and Seaside
Napa: Napa (79263) and American Canyon
Nevada: Truckee (16561) and Grass Valley
Orange: Anaheim (352005) and Santa Ana
Placer: Roseville (139117) and Rocklin
Plumas: East Quincy (2489) and Chester
Riverside: Riverside (330063) and Moreno Valley
Sacramento: Sacramento (508529) and Elk Grove
San Benito: Hollister (39749) and Ridgemark
San Bernandino: San Bernandino (215941) and Fontana
San Diego: San Diego (1426000) and Chula Vista
San Francisco: San Francisco (810000) and Concord
San Joaquin: Stockton (311178) and Tracy
San Luis Obispo: San Luis Obispo (47446) and Paso Robles
San Mateo: Daly City (107008) and San Mateo
Santa Barbara: Santa Maria (107408) and Santa Barbara
Santa Clara: San Jose (1030000) and Sunnyvale
Santa Cruz: Santa Cruz (64725) and Watsonville
Shasta: Redding (91772) and Anderson
Sierra: Loyalton (700) and Downieville
Siskiyou: Yreka (7556) and Mount Shasta
Solano: Vallejo (121913) and Fairfield
Sonoma: Santa Rosa (177586) and Petaluma
Stanislaus: Modesto (215030) and Turlock
Sutter: Yuba City and South Yuba City
Tehama: Red Bluff (14283) and Corning
Trinity: Weaverville (3667) and Post Mountain
Tulare: Visalia (133800) and Tulare
Tuolumne: Phoenix Lake-Cedar Ridge (5108) and Sonora
Ventura: Oxnard (209877) and Thousand Oaks
Yolo: Davis (69289) and Woodland
Yuba: Linda (17773) and Olivehurst
Then, I created the function below to choose a group of stations that are within a certain distance (Haversine distance from the latitude and longitude) of each city that I identified in the previous step. If there are less than 5 stations associated to a given city, the distance threshold (which starts at 100km) is increased by 50km.
FUNCTION THAT SELECTS SET OF STATIONS CLOSEST TO A GIVEN LAT/LONG
# Function to filter stations based on spatial coverage
subset_stations_by_spatial_coverage <- function(station_data, reference_lat, reference_lon, max_distance_km=100) {
# Calculate distances between stations and reference location
distances <- distHaversine(
cbind(station_data$longitude, station_data$latitude),
c(reference_lon, reference_lat)
)
distances <- distances/1000
# idx = which(distances == min(distances))
# #Identify station within min distance to centroid of county
# station_data_subset <- station_data[idx, ]
# Subset stations within the specified max_distance_km
idx = which(distances <= max_distance_km)
station_data_subset <- station_data[idx, ]
station_data_subset <- cbind(station_data_subset,distances[idx])
while (nrow(station_data_subset) < 5){
max_distance_km = max_distance_km + 50
station_data_subset = subset_stations_by_spatial_coverage(station_data,
reference_lat, reference_lon, max_distance_km)
}
return(station_data_subset)
}
# # Construct the subset of stations based on spatial coverage criteria (test)
# reference_lat = citylats[3]
# reference_lon = citylongs[3]
# max_distance_km = 100
#
# subset_stations <- subset_stations_by_spatial_coverage(station_data2014_pm2.5, reference_lat, reference_lon, max_distance_km)
#
# # Print the subset of stations
# print(subset_stations)
# Obtain centroid lat/longs for each county
CA.counties2 = read.csv("counties.ca.data.csv")
ca.coordinates = data.frame(CA.counties2$county,CA.counties2$lat,CA.counties2$lng)
colnames(ca.coordinates) = c("county","lat","long")
ca.coordinates = ca.coordinates[order(ca.coordinates$county),]
row.names(ca.coordinates) = NULL
IMPORTANT FUNCTION:
Given a dataset containing station locations/codes for a given pollutant and year, the function below selects 5-20 stations that are closest to the lat/longs for the two biggest cities in each county and puts the station information (code, lat, long, etc) into a dataframe.
#Function that finds best monitoring station for each county for a specific pollutant for a specific year
# 1 - Lead, 2 - Carbon monoxide (CO), 3 - Sulfure dioxide (SO2), 4 - Nitrogen dioxide (NO2)
# 5 - Ozone (O3), 6 - Total PM10, 7 - PM2.5
best_stations = function(stationdata,pollutant){
subset_list = list()
#Load lat/longs for 58x2 cities into dataframe
CA.coords = data.frame(rep(countycodes$value_represented,each = 2),citylats,citylongs)
colnames(CA.coords) = c("County","Lat","Long")
#Find closest station for each county centroid using subset_stations_by_spatial_coverage function
for (i in 1:nrow(CA.coords)){
reference_lat = CA.coords$Lat[i]
reference_lon = CA.coords$Long[i]
max_distance_km = 100
subset_stations <- subset_stations_by_spatial_coverage(stationdata[[pollutant]], reference_lat, reference_lon, max_distance_km)
subset_list[[i]] = subset_stations
}
#Combine pairs of city lists together
subset_list2 = list()
sequence = seq(2,116,2)
for(i in sequence){
combine = rbind(subset_list[[i]],subset_list[[i-1]])
subset_list2[[i-1]] = combine
}
subset_list2 =subset_list2[!sapply(subset_list2,is.null)]
#Create a county label vector
repnames = c()
for(i in 1:58){
repnames = c(repnames,nrow(subset_list2[[i]]))
}
countylabels = rep(countycodes$value_represented,times = repnames)
#Format the list into dataframe
beststations = as.data.frame(do.call(rbind, subset_list2))
beststations = cbind(countylabels,beststations$county_name,
beststations$`distances[idx]`,beststations)
colnames(beststations)[c(1,2,3)] = c("measuring_county","station_county","distance_apart")
rownames(beststations) = NULL
return(beststations)
}
# #test cases
# pm2.5_stations_2014 = best_stations(all_pollutants_station_2014data,7)
# CO_stations_2016 = best_stations(all_pollutants_station_2016data,2)
CREATING BEST STATION LIST/DATAFRAME FOR EACH POLLUTANT, EACH ENTRY IS A YEAR
#Generate list for best stations for each pollutant for each year
Lead_stations = list()
Lead_stations[[1]] = best_stations(all_pollutants_station_2014data,1)
Lead_stations[[2]] = best_stations(all_pollutants_station_2015data,1)
Lead_stations[[3]] = best_stations(all_pollutants_station_2016data,1)
Lead_stations[[4]] = best_stations(all_pollutants_station_2017data,1)
Lead_stations[[5]] = best_stations(all_pollutants_station_2018data,1)
Lead_stations[[6]] = best_stations(all_pollutants_station_2019data,1)
CO_stations = list()
CO_stations[[1]] = best_stations(all_pollutants_station_2014data,2)
CO_stations[[2]] = best_stations(all_pollutants_station_2015data,2)
CO_stations[[3]] = best_stations(all_pollutants_station_2016data,2)
CO_stations[[4]] = best_stations(all_pollutants_station_2017data,2)
CO_stations[[5]] = best_stations(all_pollutants_station_2018data,2)
CO_stations[[6]] = best_stations(all_pollutants_station_2019data,2)
SO2_stations = list()
SO2_stations[[1]] = best_stations(all_pollutants_station_2014data,3)
SO2_stations[[2]] = best_stations(all_pollutants_station_2015data,3)
SO2_stations[[3]] = best_stations(all_pollutants_station_2016data,3)
SO2_stations[[4]] = best_stations(all_pollutants_station_2017data,3)
SO2_stations[[5]] = best_stations(all_pollutants_station_2018data,3)
SO2_stations[[6]] = best_stations(all_pollutants_station_2019data,3)
NO2_stations = list()
NO2_stations[[1]] = best_stations(all_pollutants_station_2014data,4)
NO2_stations[[2]] = best_stations(all_pollutants_station_2015data,4)
NO2_stations[[3]] = best_stations(all_pollutants_station_2016data,4)
NO2_stations[[4]] = best_stations(all_pollutants_station_2017data,4)
NO2_stations[[5]] = best_stations(all_pollutants_station_2018data,4)
NO2_stations[[6]] = best_stations(all_pollutants_station_2019data,4)
O3_stations = list()
O3_stations[[1]] = best_stations(all_pollutants_station_2014data,5)
O3_stations[[2]] = best_stations(all_pollutants_station_2015data,5)
O3_stations[[3]] = best_stations(all_pollutants_station_2016data,5)
O3_stations[[4]] = best_stations(all_pollutants_station_2017data,5)
O3_stations[[5]] = best_stations(all_pollutants_station_2018data,5)
O3_stations[[6]] = best_stations(all_pollutants_station_2019data,5)
PM10_stations = list()
PM10_stations[[1]] = best_stations(all_pollutants_station_2014data,6)
PM10_stations[[2]] = best_stations(all_pollutants_station_2015data,6)
PM10_stations[[3]] = best_stations(all_pollutants_station_2016data,6)
PM10_stations[[4]] = best_stations(all_pollutants_station_2017data,6)
PM10_stations[[5]] = best_stations(all_pollutants_station_2018data,6)
PM10_stations[[6]] = best_stations(all_pollutants_station_2019data,6)
# Lead.PM10_stations = list()
#
# Lead.PM10_stations[[1]] = best_stations(all_pollutants_station_2014data,7)
# Lead.PM10_stations[[2]] = best_stations(all_pollutants_station_2015data,7)
# Lead.PM10_stations[[3]] = best_stations(all_pollutants_station_2016data,7)
# Lead.PM10_stations[[4]] = best_stations(all_pollutants_station_2017data,7)
# Lead.PM10_stations[[5]] = best_stations(all_pollutants_station_2018data,7)
# Lead.PM10_stations[[6]] = best_stations(all_pollutants_station_2019data,7)
PM2.5_stations = list()
PM2.5_stations[[1]] = best_stations(all_pollutants_station_2014data,7)
PM2.5_stations[[2]] = best_stations(all_pollutants_station_2015data,7)
PM2.5_stations[[3]] = best_stations(all_pollutants_station_2016data,7)
PM2.5_stations[[4]] = best_stations(all_pollutants_station_2017data,7)
PM2.5_stations[[5]] = best_stations(all_pollutants_station_2018data,7)
PM2.5_stations[[6]] = best_stations(all_pollutants_station_2019data,7)
Do the stations provide a good spatial coverage of California? To me, the coverage is reasonable especially because most of the northern and eastern counties are where most of the sparsely populated counties are located. There are probably not that many EPA stations there as a result.
pollutants1_2015 = readRDS("C:/Users/jeffr/Desktop/Spatiotemporal + Causal Inference/Wildfire Paper 1 Code/EPA data/Raw/pollutants1_2015_8.17.RData")
pollutants2_2015 = readRDS("C:/Users/jeffr/Desktop/Spatiotemporal + Causal Inference/Wildfire Paper 1 Code/EPA data/Raw/pollutants2_2015_8.17.RData")
stationlats = c(unique(pollutants1_2015$latitude),unique(pollutants2_2015$latitude))
stationlongs = c(unique(pollutants1_2015$longitude),unique(pollutants2_2015$longitude))
station_points = data.frame(stationlats,stationlongs)
#Plot
gg_pop_stations <- ggplot() +
geom_polygon(data = merged_data, aes(x = long, y = lat, group = group, fill = pop),
color = "black") +
coord_fixed(ratio = 1.3, xlim = c(-125, -112), ylim = c(30, 42)) +
theme_void() +
labs(title = "Heatmap of County Populations with Station Locations for 2015") +
scale_fill_gradient(low = "lightblue", high = "darkblue")
# Add points
gg_pop_stations <- gg_pop_stations +
geom_point(data = station_points, aes(x = stationlongs, y = stationlats),
color = "red", size = 1.5)
print(gg_pop_stations)
Given a set of 5-20 monitoring stations for each county, we loop through its station codes (for each year 2014-2019) and query using the EPA’s AQS function. This function only allows you to query a maximum of 4 parameters at once for a single year, so two calls to the function have to be made for each year (4 and 3).
(BELOW SHOWS IT BEING DONE FOR 2019)
END GOAL FINAL FORM: ONE BIG DATAFRAME (ALL POLLUTANTS ALL YEARS TOGETHER, USE FILTER TO SEPARATE)
# 1 - Lead, 2 - Carbon monoxide (CO), 3 - Sulfure dioxide (SO2), 4 - Nitrogen dioxide (NO2)
# 5 - Ozone (O3), 6 - Total PM10, 7 - PM2.5
stations2019x = rbind(Lead_stations[[6]],CO_stations[[6]],
SO2_stations[[6]],NO2_stations[[6]])
stations2019y = rbind(O3_stations[[6]],PM10_stations[[6]],PM2.5_stations[[6]])
sitenums2019x = stations2019x %>% select(county_code,site_number) %>% unique() #198 stations
sitenums2019y = stations2019y %>% select(county_code,site_number) %>% unique() #178 stations
#Trying EPA R Package query (took 15 + 20 min!) gives us a dataframe
ccodes = sitenums2019y$county_code
snums = sitenums2019y$site_number
str1 = "2019-01-01"
str2 = "2019-12-31"
pollutants1_2019 = aqs_dailysummary_by_site(parameter = c("14129","42101","42401","42602"),bdate = as.Date(str1),edate = as.Date(str2),stateFIPS = "06",countycode = ccodes,sitenum = snums)
pollutants2_2019 = aqs_dailysummary_by_site(parameter = c("44201","81102","88101"),bdate = as.Date(str1),edate = as.Date(str2),stateFIPS = "06",countycode = ccodes,sitenum = snums)
###SAVE LIST LOCALLY
saveRDS(pollutants2_2019,file = "C:/Users/jeffr/Desktop/Spatiotemporal + Causal Inference/Wildfire Paper 1 Code/EPA data/Raw/pollutants2_2019_8.18.RData")
After querying the raw data for each pollutant and each year, we want to go through it and remove any data from stations that have “bad” data. The standards that I set are that for a given year, a station should have at least 240 of the 365 days included for a year of data. Additionally, I applied a Hampel filter to identify outliers and if there are more than 14 consecutive outliers in the data i.e., days in a row with measurements that are abnormal, I considered the data from that station to not be suitable for inclusion in the final dataset.
NOTE: the daily values reported in the raw dataset are actually daily averages from periodic measurements made by the station throughout the day
After data from so called “bad” stations were removed, another function is applied which aggregates the daily observations into a monthly median The raw data is in the form of a single dataframe, which is fed into the important function raw_transform(). This function uses several functions to create a list of dataframes, one for each county, which represent the monthly median measurements for a certain pollutant in a given county in a given year.
QUALITY CHECK FUNCTION FOR STATION DATA: WANT TO ADDRESS OUTLIERS, MISSINGNESS
#Given a dataset like CO2016 (list of 1000ish stations), check for 2/3 missing data and for strings of outliers (14 in a row)
station_quality_check = function(station_data){
l = length(station_data)
badindex = c()
consecutive_outliers = list()
for (i in 1:l){
aqi = station_data[[i]]$aqi
pollutant_level = station_data[[i]]$arithmetic_mean
#check for outliers in AQI
median.aqi = median(na.omit(aqi))
mad.aqi = mad(na.omit(aqi))
min.aqi = median.aqi-(3*mad.aqi)
max.aqi = median.aqi+(3*mad.aqi)
outliers.aqi = which(aqi < min.aqi | aqi > max.aqi)
result.aqi = rle(diff(outliers.aqi))
#check for outliers in pollutant measure
median.pollutant = median(na.omit(pollutant_level))
mad.pollutant = mad(na.omit(pollutant_level))
min.pollutant = median.pollutant-(3*mad.pollutant)
max.pollutant = median.pollutant+(3*mad.pollutant)
outliers.pollutant = which(pollutant_level < min.pollutant | pollutant_level > max.pollutant)
result.pollutants = rle(diff(outliers.pollutant))
if (nrow(station_data[[i]]) < 240){
badindex = c(badindex,i)
}
else if (any(result.aqi$lengths >= 14 & result.aqi$values == 1) == TRUE){
badindex = c(badindex,i)
}
else if (any(result.pollutants$lengths >= 14 & result.pollutants$values == 1) == TRUE){
badindex = c(badindex,i)
}
consecutive_outliers[[i]] = c("AQI",outliers.aqi,"POLLUTANTS",outliers.pollutant)
}
bad_list = list(badindex,consecutive_outliers)
return(bad_list)
}
#Test on CO2016 and CO2017
# station_quality_check(CO2016) #returns 61 "bad stations" out of 1230
#
# removeidx = station_quality_check(SO22017)[[1]] #returns 715 out of 1056 "bad stations"
#
# test = SO22017[- removeidx]
FUNCTION THAT AGGREGATES DAILY DATA INTO MONTHYLY MEDIANS
monthly_agg = function(pollutantdata){
#Aggregating all the station data at once
date = ymd(pollutantdata$date_local)
df2 <- pollutantdata # Duplicate data
df2$year_month <- floor_date(date,"month") # Create year-month column
df3 = df2 %>% select(county,site_number,arithmetic_mean,aqi,year_month) %>% as.data.frame()
df3$arithmetic_mean = as.numeric(df3$arithmetic_mean)
df3$aqi[which(df3$aqi == "NULL")] = NA
df3$aqi = as.numeric(df3$aqi)
df.agg = df3 %>% group_by(year_month) %>% dplyr::summarize(arithmetic_mean = median(na.omit(arithmetic_mean)),aqi = median(na.omit(aqi))) %>% as.data.frame()
return(df.agg)
}
IMPORTANT FUNCTION: TRANSFORMING RAW DATA TO FINAL FORM
# Group 1: 14129 - Lead, 421012 - Carbon monoxide (CO), 42401 - Sulfure dioxide (SO2), 42602 - Nitrogen dioxide (NO2)
# Group 2: 44201 - Ozone (O3), 81102 - Total PM10, 88101 - PM2.5
raw_transform = function(rawdata,reference_list,standard){
###SEPARATE DF INTO A LIST OF DFs
matched_list = list()
if(missing(standard)){
for (i in 1:nrow(reference_list)){
data = rawdata %>% filter(county_code == reference_list$county_code[i], site_number == reference_list$site_number[i])
matched_list[[i]] = data
}
} else {
for (i in 1:nrow(reference_list)){
data = rawdata %>% filter(county_code == reference_list$county_code[i], site_number == reference_list$site_number[i],pollutant_standard == standard)
matched_list[[i]] = data
}
}
names(matched_list) = reference_list$measuring_county
###STATION QUALITY CHECK
removeidx = station_quality_check(matched_list)[[1]]
good_matched_list = matched_list[- removeidx]
#Convert list back into one big dataframe
temp = as.data.frame(do.call(rbind, good_matched_list)) #TOO MANY ROWS RIGHT?
good_df = unique.data.frame(temp)
###MAKE A LIST OF COMBINED STATION DATA FOR EACH COUNTY
mid_list = list()
for (i in unique(reference_list$measuring_county)){
df_new = data.frame(good_df[1,])
subset = reference_list %>% filter(measuring_county == i) %>% select(county_code,site_number)
for (j in 1:nrow(subset)){
pull = good_df %>% filter(county_code == reference_list$county_code[j], site_number == reference_list$site_number[j])
df_new = rbind(df_new,pull)
}
df_new = df_new[-1,]
mid_list[[i]] = df_new
}
###AGGREGATE DAILY DATA TO MONTHLY FOR EACH COUNTY
final_list = lapply(mid_list,monthly_agg)
return(final_list)
}
When assembling final datasets, note that certain pollutant standards are used bc they have values for AQI… the ones I used were:
Lead: Lead 3-Month 2009 ?? Has all NAs for AQI
CO: CO 8-hour 1971
SO2: SO2 1-hour 2010
NO2: NO2 1-hour 2010
O3: Ozone 8-hour 2015 ; sample duration should be 8 HR
PM10: PM10 24-hour 2006
PM2.5: PM25 24-hour 2012
APPLY RAW TRANSFORM FUNCTION TO ALL POLLUTANTS FOR ALL YEARS (BELOW SHOWS IT BEING DONE FOR 2019)
#Load raw data
pollutants1_2019 = readRDS("C:/Users/jeffr/Desktop/Spatiotemporal + Causal Inference/Wildfire Paper 1 Code/EPA data/Raw/pollutants1_2019_8.18.RData")
pollutants2_2019 = readRDS("C:/Users/jeffr/Desktop/Spatiotemporal + Causal Inference/Wildfire Paper 1 Code/EPA data/Raw/pollutants2_2019_8.18.RData")
Lead2019 = pollutants1_2019 %>% filter(parameter_code == "14129")
CO2019 = pollutants1_2019 %>% filter(parameter_code == "42101")
SO22019 = pollutants1_2019 %>% filter(parameter_code == "42401")
NO22019 = pollutants1_2019 %>% filter(parameter_code == "42602")
O32019 = pollutants2_2019 %>% filter(parameter_code == "44201")
PM102019 = pollutants2_2019 %>% filter(parameter_code == "81102")
PM2.52019 = pollutants2_2019 %>% filter(parameter_code == "88101")
Lead2019_final = raw_transform(rawdata = Lead2019,reference_list = Lead_stations[[6]],standard = "Lead 3-Month 2009")
CO2019_final = raw_transform(rawdata = CO2019,reference_list = CO_stations[[6]],standard = "CO 8-hour 1971")
SO22019_final = raw_transform(rawdata = SO22019,reference_list = SO2_stations[[6]],standard = "SO2 1-hour 2010")
NO22019_final = raw_transform(rawdata = NO22019,reference_list = NO2_stations[[6]],standard = "NO2 1-hour 2010")
O32019_final = raw_transform(rawdata = O32019,reference_list = O3_stations[[6]],standard = "Ozone 8-hour 2015")
PM102019_final = raw_transform(rawdata = PM102019,reference_list = PM10_stations[[6]],standard = "PM10 24-hour 2006")
PM2.52019_final = raw_transform(rawdata = PM2.52019,reference_list = PM2.5_stations[[6]],standard = "PM25 24-hour 2012")
COMBINING EACH POLLUTANTS DATASET INTO A SINGLE DATAFRAME FOR THE YEAR (BELOW SHOWS IT BEING DONE FOR 2019)
###Combine final data into one dataframe for 2014
test1 = as.data.frame(do.call(rbind, Lead2019_final))
test1 = cbind(test1,rep(pollutants$parametercodes.code[1],nrow(test1))) #maybe change parameter codes to 1-7?
colnames(test1) = c("Year-Month","Value","AQI","Pollutant")
test2 = as.data.frame(do.call(rbind, CO2019_final))
test2 = cbind(test2,rep(pollutants$parametercodes.code[2],nrow(test2))) #maybe change parameter codes to 1-7?
colnames(test2) = c("Year-Month","Value","AQI","Pollutant")
test3 = as.data.frame(do.call(rbind, SO22019_final))
test3 = cbind(test3,rep(pollutants$parametercodes.code[3],nrow(test3))) #maybe change parameter codes to 1-7?
colnames(test3) = c("Year-Month","Value","AQI","Pollutant")
test4 = as.data.frame(do.call(rbind, NO22019_final))
test4 = cbind(test4,rep(pollutants$parametercodes.code[4],nrow(test4))) #maybe change parameter codes to 1-7?
colnames(test4) = c("Year-Month","Value","AQI","Pollutant")
test5 = as.data.frame(do.call(rbind, O32019_final))
test5 = cbind(test5,rep(pollutants$parametercodes.code[5],nrow(test5))) #maybe change parameter codes to 1-7?
colnames(test5) = c("Year-Month","Value","AQI","Pollutant")
test6 = as.data.frame(do.call(rbind, PM102019_final))
test6 = cbind(test6,rep(pollutants$parametercodes.code[6],nrow(test6))) #maybe change parameter codes to 1-7?
colnames(test6) = c("Year-Month","Value","AQI","Pollutant")
test7 = as.data.frame(do.call(rbind, PM2.52019_final))
test7 = cbind(test7,rep(pollutants$parametercodes.code[7],nrow(test7))) #maybe change parameter codes to 1-7?
colnames(test7) = c("Year-Month","Value","AQI","Pollutant")
#Combine each pollutant dataset into one big dataset for the year
final_data19 = rbind(test1,test2,test3,test4,test5,test6,test7)
###SAVE FINAL DATASET LOCALLY
saveRDS(final_data19,file = "C:/Users/jeffr/Desktop/Spatiotemporal + Causal Inference/Wildfire Paper 1 Code/EPA data/Aggregated/final_data19_9.1.RData")
HAVE TO CLEAN DATA BEFORE FINALIZING:
Having combined the monthly medians for every county for a single year for each pollutant into a single dataframe, one final cleaning step must be performed before putting each years’ data together. The AQI value that is in each row corresponds to the AQI standardized measurement for that specific pollutant. Each pollutant has a different standardizing equation, but once they are all standardized as they are in the dataset and they can be compared against each other. The reported AQI measurement for a given day is just the maximum of the AQI values corresponding to each of the 7 pollutants. So the maximum AQI value (among 6 values bc Lead observations never have AQI values) was found for each month and that value was set as the actual AQI value for that month in all corresponding rows.
FIND MAX AQI (AMONG THE 7 POLLUTANTS) FOR EACH MONTH -> SET AS ACTUAL AQI FOR THAT MONTH
(BELOW SHOWS IT BEING DONE FOR 2019)
#Do for each year
months = c("01","02","03","04","05","06","07","08","09","10","11","12")
###Do for each year
for (i in 1:58){
idx1 = which(stringr::str_starts(rownames(final_data19), counties[i]))
subset1 = final_data19[idx1,]
subset1$`Year-Month`= as.Date(subset1$`Year-Month`)
for (j in months){
#Filter by county and date
date = paste0("2019-",j,"-01")
date = as.Date(date)
subset2 = subset1 %>% filter(`Year-Month` == as.Date(date))
trueAQI = max(na.omit(subset2$AQI))
idx2 = which(subset1$`Year-Month` == date)
subset1$AQI[idx2] = trueAQI
}
final_data19[idx1,] = subset1
}
###SAVE FINAL DATASET LOCALLY
saveRDS(final_data19,file = "C:/Users/jeffr/Desktop/Spatiotemporal + Causal Inference/Wildfire Paper 1 Code/EPA data/Aggregated/final_data19_9.1.RData")
COMBINING EACH YEARS DATASET INTO ONE BIG TIDY DATAFRAME FOR AIR QUALITY COVARIATES
FILL IN YOUR OWN FILE DIRECTORIES HERE! START WORKING W ACTUAL EPA DATA FROM HERE ON
final_data14 = readRDS("C:/Users/jeffr/Desktop/Spatiotemporal + Causal Inference/Wildfire Paper 1 Code/EPA data/Aggregated/final_data14_9.1.RData")
final_data15 = readRDS("C:/Users/jeffr/Desktop/Spatiotemporal + Causal Inference/Wildfire Paper 1 Code/EPA data/Aggregated/final_data15_9.1.RData")
final_data16 = readRDS("C:/Users/jeffr/Desktop/Spatiotemporal + Causal Inference/Wildfire Paper 1 Code/EPA data/Aggregated/final_data16_9.1.RData")
final_data17 = readRDS("C:/Users/jeffr/Desktop/Spatiotemporal + Causal Inference/Wildfire Paper 1 Code/EPA data/Aggregated/final_data17_9.1.RData")
final_data18 = readRDS("C:/Users/jeffr/Desktop/Spatiotemporal + Causal Inference/Wildfire Paper 1 Code/EPA data/Aggregated/final_data18_9.1.RData")
final_data19 = readRDS("C:/Users/jeffr/Desktop/Spatiotemporal + Causal Inference/Wildfire Paper 1 Code/EPA data/Aggregated/final_data19_9.1.RData")
final_EPA_data = rbind(final_data14,final_data15,final_data16,final_data17,
final_data18,final_data19)
saveRDS(final_EPA_data,file = "C:/Users/jeffr/Desktop/Spatiotemporal + Causal Inference/Wildfire Paper 1 Code/EPA data/Final/final_EPA_data_9.1.RData")
final_EPA_data = readRDS("C:/Users/jeffr/Desktop/Spatiotemporal + Causal Inference/Wildfire Paper 1 Code/EPA data/Final/final_EPA_data_9.1.RData")
head(final_EPA_data)
## Year-Month Value AQI Pollutant
## Alameda.1 2014-01-01 0.0500 60 14129
## Alameda.2 2014-02-01 0.0520 28 14129
## Alameda.3 2014-03-01 0.0655 34 14129
## Alameda.4 2014-04-01 0.0360 40 14129
## Alameda.5 2014-05-01 0.0300 38 14129
## Alameda.6 2014-06-01 0.0230 37 14129
Once the EPA data is in the correct format (one big dataframe) at the county level, we now need to aggregate it to the cluster level according to the SKATER cluster labels from before. A population weighted mean was used to aggregate here as well
Cluster = rep(1,length(final_EPA_data))
final_EPA_agg_data = cbind(final_EPA_data,Cluster)
for (i in 1:58){
idx = which(stringr::str_starts(rownames(final_EPA_agg_data), counties[i]))
final_EPA_agg_data$Cluster[idx] = clusterlabels$Cluster[i]
}
Time = c(rep(c(1:12),58),rep(13:24,58),rep(25:36,58),rep(37:48,58),rep(49:60,58),rep(61:72,58))
Time = rep(Time,7)
final_EPA_agg_data = cbind(Time,final_EPA_agg_data)
AGGREGATE CLUSTER DATA AND COMBINE INTO ONE DATAFRAME
countypops = CA_data %>% filter(Year > 2013) %>% select(Total_Pop,County,Year) %>% unique()
countypops = cbind(countypops,Cluster = rep(clusterlabels$Cluster,each=6))
countypops$County = rep(counties,each=6)
temp_EPA_agg_data = data.frame(final_EPA_agg_data[1,-2])
num_clus = length(unique(clusterlabels$Cluster))
for (k in 1:num_clus){
EPA_clus_k = data.frame(final_EPA_agg_data[1,-2])
for (i in pollutants$parametercodes.code){
pollutant_data = final_EPA_agg_data %>% filter(Pollutant == i)
cluster_data = pollutant_data %>% filter(Cluster == k)
cluster_data$Value = scale(cluster_data$Value)
cluster_data$AQI = scale(cluster_data$AQI)
year = 2014
for(j in 1:72){
cluster_data_j = cluster_data %>% filter(Time == j)
cluster_counties = countypops %>% filter(Cluster == k,Year == year)
pops = countypops %>% filter(Year == year,Cluster == k) %>% select(Total_Pop)
cluster.pop = sum(pops)
cluster.popweights = pops/cluster.pop
value_wmean = weighted.mean(cluster_data_j$Value,cluster.popweights$Total_Pop)
aqi_wmean = weighted.mean(cluster_data_j$AQI,cluster.popweights$Total_Pop)
insert = data.frame(Time = j,value_wmean,aqi_wmean,
Pollutant = i,Cluster = k)
colnames(insert) = colnames(EPA_clus_k)
EPA_clus_k = rbind(EPA_clus_k,insert)
if ((j>12) & (j<25)){
year = 2015
}
else if ((j>24) & (j<37)){
year = 2016
}
else if ((j>36) & (j<49)){
year = 2017
}
else if ((j>48) & (j<61)){
year = 2018
}
else if ((j>60) & (j<73)){
year = 2019
}
else{
year = 2014
}
}
}
EPA_clus_k = EPA_clus_k[-1,]
trueAQI = EPA_clus_k$AQI[1:72]
trueAQI = rep(trueAQI,7)
EPA_clus_k$AQI = trueAQI
rownames(EPA_clus_k) = NULL
temp_EPA_agg_data = rbind(temp_EPA_agg_data,EPA_clus_k)
}
final_EPA_agg_data = temp_EPA_agg_data[-1,]
saveRDS(final_EPA_agg_data,file = "C:/Users/jeffr/Desktop/Spatiotemporal + Causal Inference/Wildfire Paper 1 Code/EPA data/Final/final_8.1_EPA_agg_data_10.26.RData")
final_EPA_agg_data = readRDS("C:/Users/jeffr/Desktop/Spatiotemporal + Causal Inference/Wildfire Paper 1 Code/EPA data/Final/final_8.1_EPA_agg_data_10.26.RData")
head(final_EPA_agg_data)
## Time Value AQI Pollutant Cluster
## 1 1 3.4877288 2.55384131 14129 1
## 2 2 3.7084663 -1.30105429 14129 1
## 3 3 5.1984443 -0.50143403 14129 1
## 4 4 1.9425665 0.21232654 14129 1
## 5 5 1.2803540 -0.05020462 14129 1
## 6 6 0.5077729 -0.31519038 14129 1
library(astsa)
for (i in pollutants$parametercodes.code){
for (j in 1:num_clus){
EPA_clus_ts = final_EPA_agg_data %>% filter(Pollutant == i) %>% filter(Cluster == j)
title = sprintf("Pollutant %s - Cluster %1.0f",i,j)
tsplot(EPA_clus_ts$Value,main = title,xlab = "Time (months)",ylab = "Value")
}
}
It turns out that the time series for Lead was not stationary, so we need to detrend the data before using it
for (i in pollutants$parametercodes.code){
for (j in 1:num_clus){
EPA_clus_ts = final_EPA_agg_data %>% filter(Pollutant == i) %>% filter(Cluster == j)
lead_idx = which(final_EPA_agg_data$Pollutant == "14129" & final_EPA_agg_data$Cluster == j)
if (i == "14129"){
detrended1 = c(0,diff(EPA_clus_ts$Value,lag=1))
final_EPA_agg_data[lead_idx,2] = detrended1
title = sprintf("Pollutant %s - Cluster %1.0f",i,j)
tsplot(detrended1,main = title,xlab = "Time (months)",ylab = expression(nabla~Value))
} else{
title = sprintf("Pollutant %s - Cluster %1.0f",i,j)
tsplot(EPA_clus_ts$Value,main = title,xlab = "Time (months)",ylab = "Value")
}
}
}
for (k in 1:num_clus){
AQI_clus_ts = final_EPA_agg_data %>% filter(Pollutant == "14129") %>% filter(Cluster == k)
title = sprintf("AQI - Cluster %1.0f",k)
tsplot(AQI_clus_ts$AQI,main = title,xlab = "Time (months)",ylab = "Value")
}
The kernel structures presented below reflect the findings of Chen et al. (2019) that distributed lags and interaction effects should be considered when evaluating the effects of air pollutants. Additionally, we wanted these lagged and joint effects to also vary in time. So, instead of calculating true autoregressive (AR), distributed lag (DL), and interaction structures, which assume stationarity, we construct localized kernels which have dynamic localized conditional correlations through time.
EPA_cluster_list = list()
cluster_EPA_data = function(cluster){
EPA_clus_ts = matrix(nrow=72)
for (i in pollutants$parametercodes.code){
covariate_ts = final_EPA_agg_data %>% filter(Pollutant == i) %>% filter(Cluster == cluster)
EPA_clus_ts = cbind(EPA_clus_ts,covariate_ts$Value)
}
EPA_clus_ts[,1] = covariate_ts$AQI
colnames(EPA_clus_ts) = c("AQI","Lead","CO","SO2","NO2","O3","PM10","PM2.5")
return(EPA_clus_ts)
}
for (i in 1:num_clus){
EPA_cluster_list[[i]] = cluster_EPA_data(i)
}
The first step is to decompose our covariate time series into its trend, seasonal, and residual components because we want to learn each kernel on a different component.
decompose_ts = function(EPA_ts){
S_scaled = EPA_ts
colnames(S_scaled) = c("AQI","Lead","CO","SO2","NO2","O3","PM10","PM2.5")
S_scaled_ts = ts(S_scaled, frequency = 12)
# S_decomposed = decompose(S_scaled_ts)
ts_decomposed <- lapply(colnames(S_scaled_ts), function(x) {stl(S_scaled_ts[, x], s.window = "periodic")})
names(ts_decomposed) = colnames(S_scaled_ts)
S_seasonal = ts_decomposed[[1]]$time.series[,1]
S_trend = ts_decomposed[[1]]$time.series[,2]
S_random = ts_decomposed[[1]]$time.series[,3]
for (i in 2:8){
S_seasonal = cbind(S_seasonal,ts_decomposed[[i]]$time.series[,1])
S_trend = cbind(S_trend,ts_decomposed[[i]]$time.series[,2])
S_random = cbind(S_random,ts_decomposed[[i]]$time.series[,3])
}
colnames(S_trend)= colnames(S_scaled_ts)
colnames(S_seasonal)= colnames(S_scaled_ts)
colnames(S_random)= colnames(S_scaled_ts)
S_DL = S_seasonal + S_random
colnames(S_DL)= colnames(S_scaled_ts)
S_random_int = S_random[12:72,]
S_random = S_random[13:72,]
S_seasonal = S_seasonal[13:72,]
S_trend = S_trend[13:72,]
S_DL = S_DL[13:72,]
S_DL = data.frame(S_DL)
S_DL2 = matrix(nrow=60)
dl = c(3,6,12)
col_num = 2
for (i in dl){
for (j in 1:ncol(S_DL)){
extract = S_DL[(72-59-i):(72-i),j]
S_DL2 = cbind(S_DL2,extract)
colnames(S_DL2)[col_num] = sprintf("B%1.0f-%s",i,colnames(S_DL)[j])
col_num = col_num+1
}
}
S_DL2 = S_DL2[,-1]
S_DL_final = cbind(S_DL[13:72,],S_DL2)
W = matrix(nrow=(nrow(S_random))^2)
num_cols = ncol(S_random)
# num_cols = ncol(S_DL_final) #for now, just calculate interaction pairs for actual covariates
col_num = 2
for (i in 1:num_cols){
for (j in 1:num_cols){
interaction_col = kronecker(S_random[,i],S_random[,j]) #replace S_random with S_DL_final for DL interactions
W = cbind(W,interaction_col)
colnames(W)[col_num] = sprintf("%sx%s",colnames(S_scaled)[i],colnames(S_scaled)[j])
col_num = col_num+1
}
}
W = W[,-1]
row1 = c()
for (k in 1:ncol(S_random_int)){
row1 = c(row1,S_random_int[2,k]*S_random_int[1,])
}
W2 = rbind(as.numeric(row1),W)
W2 = W2[,-seq(1,64,by=9)] #need to change if we include DL interactions
list = list(S_random,S_random_int,S_seasonal,S_DL,
S_DL2,S_DL_final,S_trend,W2)
names(list) = c("S_random","S_random_int","S_seasonal","S_DL",
"S_DL2","S_DL_final","S_trend","W2")
return(list)
}
decompose_clus1 = decompose_ts(EPA_cluster_list[[1]])
decompose_clus2 = decompose_ts(EPA_cluster_list[[2]])
decompose_clus3 = decompose_ts(EPA_cluster_list[[3]])
decompose_clus4 = decompose_ts(EPA_cluster_list[[4]])
decompose_clus5 = decompose_ts(EPA_cluster_list[[5]])
decompose_clus6 = decompose_ts(EPA_cluster_list[[6]])
decompose_clus7 = decompose_ts(EPA_cluster_list[[7]])
decomposed_cluster_data = list(decompose_clus1,decompose_clus2,decompose_clus3,
decompose_clus4,decompose_clus5,decompose_clus6,
decompose_clus7)
S_random_all = cbind(decompose_clus1$S_random,decompose_clus2$S_random,
decompose_clus3$S_random,decompose_clus4$S_random,
decompose_clus5$S_random,decompose_clus6$S_random,
decompose_clus7$S_random)
S_DL_all = cbind(cbind(decompose_clus1$S_DL,decompose_clus2$S_DL,
decompose_clus3$S_DL,decompose_clus4$S_DL,
decompose_clus5$S_DL,decompose_clus6$S_DL,
decompose_clus7$S_DL))
W2_all = cbind(cbind(decompose_clus1$W2,decompose_clus2$W2,
decompose_clus3$W2,decompose_clus4$W2,
decompose_clus5$W2,decompose_clus6$W2,
decompose_clus7$W2))
Using the residual component of each time series, we construct a kernel that calculates the autocorrelation at one lag for all time points
Linear time invariant approach:
Let \(\sigma^2_{AR} = \gamma(0)\) be the variance of one of our time series, we can find ACVF and ACF from Yule-Walker as
\(\gamma(k) = a_1 \gamma(k-1) + a_2 \gamma(k-2) + ... + a_p \gamma(k-p)\)
\(\rho(k) = \frac{\gamma(k)}{\gamma(0)} = a_1 \rho(k-1) + a_2 \rho(k-2) + ... + a_p \rho(k-p)\)
AR_invariant_list = list()
for (c in 1:num_clus){
#Grab S_random data for cluster c
cluster_data = decomposed_cluster_data[[c]]
S_random_clus = cluster_data$S_random
ar.corr.values = c()
ar.cov.values = c()
for (i in 1:ncol(S_random_clus)){
var = var(S_random_clus[,i])
fit.ar = ar(S_random_clus[,i],order.max = 1, aic = FALSE, method = "yule-walker")
corr.ar1 = fit.ar$ar
cov.ar1 = fit.ar$ar * var
ar.corr.values = c(ar.corr.values,corr.ar1)
ar.cov.values = c(ar.cov.values,cov.ar1)
}
for (j in 1:ncol(S_random_clus)){
AR_invariant_covmatrix = diag(nrow(S_random_clus))
AR_invariant_covmatrix[row(AR_invariant_covmatrix) == col(AR_invariant_covmatrix) - 1] = ar.cov.values[j]
AR_invariant_covmatrix[row(AR_invariant_covmatrix) == col(AR_invariant_covmatrix) + 1] = ar.cov.values[j]
}
AR_invariant = diag(nrow(S_random_clus))
AR_invariant[row(AR_invariant) == col(AR_invariant) - 1] = (1/length(ar.cov.values))*sum(ar.cov.values)
AR_invariant[row(AR_invariant) == col(AR_invariant) + 1] = (1/length(ar.cov.values))*sum(ar.cov.values)
# corrplot(AR_invariant, order = 'original', cl.pos = 'b', tl.pos = 'n',method = "color", col = COL1('YlOrRd',10),
# title = "Invariant AR 1 Covariance Structure")
matrix_heatmap(AR_invariant,title = "Invariant AR 1 Covariance Structure")
AR_invariant_list[[c]] = AR_invariant
}
K_AR_invariant = matrix(0,nrow=60,ncol=60)
for(i in 1:num_clus){
K_AR_invariant = K_AR_invariant + ((1/num_clus)*AR_invariant_list[[i]])
}
# corrplot(K_AR_invariant, order = 'original', cl.pos = 'b', tl.pos = 'n',method = "color", col = COL1('YlOrRd',10),
# title = "Invariant AR 1 Covariance Structure")
matrix_heatmap(K_AR_invariant,title = "Invariant AR 1 Covariance Structure")
## Warning in geom_rect(aes(xmin = x_min, xmax = x_max, ymin = y_min, ymax = y_max), : All aesthetics have length 1, but the data has 3600 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
## a single row.
K_AR_cluster = list()
K_AR_periodic_cluster = list()
for (c in 1:num_clus){
#Grab S_random data for cluster c
cluster_data = decomposed_cluster_data[[c]]
S_random_clus = cluster_data$S_random
#Create a list to contain covariance matrix for each pollutant (8)
K_AR_list = list()
K_AR_periodic_list = list()
rho_AR = 1
sigma2_AR = 1
time_span = nrow(S_random_clus)
#Calculate a AR 1 covariance matrix for each pollutant and store in list
for (i in 1:8){
ts = S_random_clus[,i]
K_covariate = matrix(nrow=time_span,ncol=time_span)
K_covariate_periodic = matrix(nrow=time_span,ncol=time_span)
for(j in 1:time_span){
for (k in 1:time_span){
if (abs(j-k) <= 1){
K_covariate[j,k] = exp(- ((ts[j] - ts[k])^2) #RBF kernel
/ (2*rho_AR)) * sigma2_AR
K_covariate_periodic[j,k] = exp(- ((ts[j] - ts[k])^2) #Locally periodic kernel
/ (2*rho_AR)) * exp(- (2*sin((abs(ts[j] - ts[k]))*pi/12)^2)
/ (rho_AR)) * sigma2_AR
}
else{
K_covariate_periodic[j,k] = 0
K_covariate[j,k] = 0
}
}
}
K_AR_list[[i]] = K_covariate
K_AR_periodic_list[[i]] = K_covariate_periodic
}
names(K_AR_list) = colnames(S_random_clus)
names(K_AR_periodic_list) = colnames(S_random_clus)
#Add each pollutant's covariance matrix to get AR 1 matrix for each cluster
K_AR = matrix(0,nrow=60,ncol=60)
K_AR_periodic = matrix(0,nrow=60,ncol=60)
K_AR_periodic_weights = rep(1,length(K_AR_periodic_list))
for(i in 1:length(K_AR_periodic_list)){
K_AR = K_AR + ((1/8)*K_AR_list[[i]])
K_AR_periodic = K_AR_periodic + ((1/8)*K_AR_periodic_list[[i]])
K_AR_periodic_weights[i] = norm(K_AR_periodic_list[[i]],type = "F")
}
K_AR_periodic_weights = K_AR_periodic_weights / sum(K_AR_periodic_weights)
print(K_AR_periodic_weights)
K_AR_cluster[[c]] = K_AR
K_AR_periodic_cluster[[c]] = K_AR_periodic
}
## [1] 0.1182129 0.1258859 0.1279207 0.1202214 0.1291226 0.1320892 0.1254036
## [8] 0.1211437
## [1] 0.1180238 0.1257031 0.1279176 0.1208326 0.1290610 0.1318444 0.1254549
## [8] 0.1211627
## [1] 0.1177953 0.1260674 0.1281143 0.1206671 0.1292772 0.1321721 0.1251538
## [8] 0.1207528
## [1] 0.1179131 0.1258756 0.1279283 0.1207317 0.1295156 0.1320599 0.1251074
## [8] 0.1208684
## [1] 0.1178528 0.1258298 0.1278816 0.1206277 0.1293356 0.1320934 0.1257028
## [8] 0.1206765
## [1] 0.1183861 0.1257906 0.1279683 0.1210789 0.1290587 0.1317483 0.1247048
## [8] 0.1212643
## [1] 0.1183055 0.1254444 0.1268239 0.1206665 0.1289935 0.1315729 0.1272763
## [8] 0.1209169
for (i in 1:num_clus){
title1 = sprintf("AR 1 Covariance for Cluster %s",i)
title2 = sprintf("Periodic AR 1 Covariance for Cluster %s",i)
# corrplot(K_AR_cluster[[i]], order = 'original', cl.pos = 'b', tl.pos = 'n',method = "color", col = COL1('YlOrRd',10),
# title = title1)
# corrplot(K_AR_periodic_cluster[[i]], order = 'original', cl.pos = 'b', tl.pos = 'n',method = "color", col = COL1('YlOrRd',10),
# title = title2)
matrix_heatmap(K_AR_cluster[[i]],title = title1)
matrix_heatmap(K_AR_periodic_cluster[[i]],title = title2)
}
Combine each cluster’s AR 1 kernel together:
K_AR = matrix(0,nrow=60,ncol=60)
K_AR_periodic = matrix(0,nrow=60,ncol=60)
K_AR_periodic_weights = rep(1,num_clus)
for(i in 1:num_clus){
K_AR = K_AR + ((1/num_clus)*K_AR_cluster[[i]])
K_AR_periodic = K_AR_periodic + ((1/num_clus)*K_AR_periodic_cluster[[i]])
K_AR_periodic_weights[i] = norm(K_AR_cluster[[i]],type = "F")
}
K_AR_periodic_weights = K_AR_periodic_weights / sum(K_AR_periodic_weights)
print(K_AR_periodic_weights)
## [1] 0.1428045 0.1429864 0.1425446 0.1426904 0.1428154 0.1428429 0.1433158
#Heatmap of resulting K
# corrplot(K_AR, order = 'original', cl.pos = 'b', tl.pos = 'n',method = "color", col = COL1('YlOrRd',10),
# title = "AR 1 Covariance Structure")
# corrplot(K_AR_periodic, order = 'original', cl.pos = 'b', tl.pos = 'n',method = "color", col = COL1('YlOrRd',10),
# title = "Periodic AR 1 Covariance Structure")
matrix_heatmap(K_AR,title = "AR 1 Covariance Structure")
## Warning in geom_rect(aes(xmin = x_min, xmax = x_max, ymin = y_min, ymax = y_max), : All aesthetics have length 1, but the data has 3600 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
## a single row.
matrix_heatmap(K_AR_periodic,title = "Periodic AR 1 Covariance Structure")
## Warning in geom_rect(aes(xmin = x_min, xmax = x_max, ymin = y_min, ymax = y_max), : All aesthetics have length 1, but the data has 3600 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
## a single row.
Here, we wish to account for lagged effects that we believe are significant (3,6, and 12 months). Note that we add the seasonal component to the residual component of our covariate time series decomposition to get our input for this kernel.
DL_invariant_list = list()
for (c in 1:num_clus){
#Grab S_DL data for cluster c
cluster_data = decomposed_cluster_data[[c]]
S_DL_clus = cluster_data$S_DL
dl3.corr.values = c()
dl3.cov.values = c()
dl6.corr.values = c()
dl6.cov.values = c()
dl12.corr.values = c()
dl12.cov.values = c()
for (i in 1:ncol(S_DL_clus)){
var = var(S_DL_clus[,i])
fit.ar = ar(S_DL_clus[,i],order.max = 1, aic = FALSE, method = "yule-walker")
#Fit a parametric AR model for each lag
dl3 = arima(S_DL_clus[,i],order = c(3,0,0),seasonal = c(0,0,0),include.mean = FALSE,fixed = c(0,0,NA))
dl6 = arima(S_DL_clus[,i],order = c(6,0,0),seasonal = c(0,0,0),include.mean = FALSE,fixed = c(0,0,0,0,0,NA))
dl12 = arima(S_DL_clus[,i],order = c(12,0,0),seasonal = c(0,0,0),include.mean = FALSE,
fixed = c(0,0,0,0,0,0,0,0,0,0,0,NA))
#Calculate correlations and covariances from coefficient estimates
corr.dl3 = as.numeric(dl3$coef[3])
cov.dl3 = as.numeric(dl3$coef[3]) * var
corr.dl6 = as.numeric(dl6$coef[6])
cov.dl6 = as.numeric(dl6$coef[6]) * var
corr.dl12 = as.numeric(dl12$coef[12])
cov.dl12 = as.numeric(dl12$coef[12]) * var
dl3.corr.values = c(dl3.corr.values,corr.dl3)
dl3.cov.values = c(dl3.cov.values,cov.dl3)
dl6.corr.values = c(dl6.corr.values,corr.dl6)
dl6.cov.values = c(dl6.cov.values,cov.dl6)
dl12.corr.values = c(dl12.corr.values,corr.dl12)
dl12.cov.values = c(dl12.cov.values,cov.dl12)
}
# Run if you want to create a DL_invariant matrix for each pollutant
# for (j in 1:ncol(S_DL_clus)){
# DL_invariant_covmatrix = diag(nrow(S_DL_clus))
#
# DL_invariant_covmatrix[row(DL_invariant_covmatrix) == col(DL_invariant_covmatrix) - 3] = dl3.cov.values[j]
# DL_invariant_covmatrix[row(DL_invariant_covmatrix) == col(DL_invariant_covmatrix) + 3] = dl3.cov.values[j]
#
# DL_invariant_covmatrix[row(DL_invariant_covmatrix) == col(DL_invariant_covmatrix) - 6] = dl6.cov.values[j]
# DL_invariant_covmatrix[row(DL_invariant_covmatrix) == col(DL_invariant_covmatrix) + 6] = dl6.cov.values[j]
#
# DL_invariant_covmatrix[row(DL_invariant_covmatrix) == col(DL_invariant_covmatrix) - 12] = dl12.cov.values[j]
# DL_invariant_covmatrix[row(DL_invariant_covmatrix) == col(DL_invariant_covmatrix) + 12] = dl12.cov.values[j]
#
# # title = sprintf("Covariance of %s",colnames(S_DL_all)[j])
# # corrplot(DL_invariant_covmatrix, order = 'original', cl.pos = 'b', tl.pos = 'n',method = "color", col = COL1('YlOrRd',10),
# # title = title)
# }
DL_invariant_covmatrix = diag(nrow(S_DL_clus))
DL_invariant_covmatrix[row(DL_invariant_covmatrix) == col(DL_invariant_covmatrix) - 3] = sum(dl3.cov.values)*(1/8)
DL_invariant_covmatrix[row(DL_invariant_covmatrix) == col(DL_invariant_covmatrix) + 3] = sum(dl3.cov.values)*(1/8)
DL_invariant_covmatrix[row(DL_invariant_covmatrix) == col(DL_invariant_covmatrix) - 6] = sum(dl6.cov.values)*(1/8)
DL_invariant_covmatrix[row(DL_invariant_covmatrix) == col(DL_invariant_covmatrix) + 6] = sum(dl6.cov.values)*(1/8)
DL_invariant_covmatrix[row(DL_invariant_covmatrix) == col(DL_invariant_covmatrix) - 12] = sum(dl12.cov.values)*(1/8)
DL_invariant_covmatrix[row(DL_invariant_covmatrix) == col(DL_invariant_covmatrix) + 12] = sum(dl12.cov.values)*(1/8)
DL_invariant_list[[c]] = DL_invariant_covmatrix
# corrplot(DL_invariant_covmatrix, order = 'original', cl.pos = 'b', tl.pos = 'n',method = "color", col = COL1('YlOrRd',10),
# title = "Invariant DL (3,6,12) Covariance Structure")
matrix_heatmap(DL_invariant_covmatrix,title = "Invariant DL (3,6,12) Covariance Structure")
}
#Combine DL covariance matrices from each cluster together
K_DL_invariant = matrix(0,nrow=60,ncol=60)
for(i in 1:num_clus){
K_DL_invariant = K_DL_invariant + ((1/num_clus)*DL_invariant_list[[i]])
}
# corrplot(K_DL_invariant, order = 'original', cl.pos = 'b', tl.pos = 'n',method = "color", col = COL1('YlOrRd',10),
# title = "Invariant DL (3,6,12) Covariance Structure")
matrix_heatmap(K_DL_invariant,title = "Invariant DL (3,6,12) Covariance Structure")
K_DL_cluster = list()
K_DL_periodic_cluster = list()
for (c in 1:num_clus){
#Grab S_DL data for cluster c
cluster_data = decomposed_cluster_data[[c]]
S_DL_clus = cluster_data$S_DL
#Create a list to store covariance matrix for each DL
K_DL_list = list()
K_DL_periodic_list = list()
dl_lags = c(3,6,12)
tracker = 1
for (i in dl_lags){
K_DL = matrix(nrow=time_span,ncol=time_span)
K_DL_periodic = matrix(nrow=time_span,ncol=time_span)
rho_DL = 1
sigma2_DL = 1
#Calculate DL covariance matrix for specified lag
for(j in 1:nrow(S_DL_clus)){
for (k in 1:nrow(S_DL_clus)){
if ((abs(j-k) == 0) || (abs(j-k) == i)){
K_DL[j,k] = exp(- (sum(S_DL_clus[j,] - S_DL_clus[k,])^2) / (2*rho_DL)) * sigma2_DL
K_DL_periodic[j,k] = exp(- (sum(S_DL_clus[j,] - S_DL_clus[k,])^2)
/ (2*rho_DL)) * exp(- (2*sin(sum(abs(S_DL_clus[j,] - S_DL_clus[k,]))*pi/12)^2)
/ (rho_DL)) * sigma2_DL
}
else{
K_DL_periodic[j,k] = 0
K_DL[j,k] = 0
}
}
}
K_DL_list[[tracker]] = K_DL
K_DL_periodic_list[[tracker]] = K_DL_periodic
tracker = tracker+1
}
#Combine the 3 DL covariance matrices together
K_DL = matrix(0,nrow=time_span,ncol=time_span)
K_DL_periodic = matrix(0,nrow=time_span,ncol=time_span)
K_DL_periodic_weights = rep(1,length(K_DL_periodic_list))
for(i in 1:length(K_DL_periodic_list)){
K_DL = K_DL + ((1/3)*K_DL_list[[i]])
K_DL_periodic = K_DL_periodic + ((1/3)*K_DL_periodic_list[[i]])
K_DL_periodic_weights[i] = norm(K_DL_periodic_list[[i]],type = "F")
}
K_DL_periodic_weights = K_DL_periodic_weights / sum(K_DL_periodic_weights)
print(K_DL_periodic_weights)
#Store DL(3,6,12) covariance matrix for each cluster
K_DL_cluster[[c]] = K_DL
K_DL_periodic_cluster[[c]] = K_DL
}
## [1] 0.3309092 0.3280791 0.3410116
## [1] 0.3299808 0.3265991 0.3434201
## [1] 0.3332712 0.3269659 0.3397629
## [1] 0.3318032 0.3263415 0.3418554
## [1] 0.3317627 0.3274553 0.3407819
## [1] 0.3299765 0.3262256 0.3437979
## [1] 0.3316097 0.3263056 0.3420847
Combining DL kernels for each cluster together:
K_DL = matrix(0,nrow=time_span,ncol=time_span)
K_DL_periodic = matrix(0,nrow=time_span,ncol=time_span)
K_DL_periodic_weights = rep(1,num_clus)
for(i in 1:num_clus){
K_DL = K_DL + ((1/num_clus)*K_DL_cluster[[i]])
K_DL_periodic = K_DL_periodic + ((1/num_clus)*K_DL_periodic_cluster[[i]])
K_DL_periodic_weights[i] = norm(K_DL_periodic_cluster[[i]],type = "F")
}
K_DL_periodic_weights = K_DL_periodic_weights / sum(K_DL_periodic_weights)
print(K_DL_periodic_weights)
## [1] 0.1425824 0.1430462 0.1431990 0.1431142 0.1426814 0.1430127 0.1423643
#Heatmap of resulting K
# corrplot(K_DL, order = 'original', cl.pos = 'b', tl.pos = 'n',method = "color", col = COL1('YlOrRd',10),
# title = "DL (3,6,12) Covariance Structure")
# corrplot(K_DL_periodic, order = 'original', cl.pos = 'b', tl.pos = 'n',method = "color", col = COL1('YlOrRd',10),
# title = "Periodic DL (3,6,12) Covariance Structure")
matrix_heatmap(K_DL,title = "DL (3,6,12) Covariance Structure")
## Warning in geom_rect(aes(xmin = x_min, xmax = x_max, ymin = y_min, ymax = y_max), : All aesthetics have length 1, but the data has 3600 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
## a single row.
matrix_heatmap(K_DL_periodic,title = "Periodic DL (3,6,12) Covariance Structure")
## Warning in geom_rect(aes(xmin = x_min, xmax = x_max, ymin = y_min, ymax = y_max), : All aesthetics have length 1, but the data has 3600 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
## a single row.
Finally, we want to include the two way interaction structures for every pair of EPA covariate time series. It is reasonable to think that the value of an air pollutant covariate at one time point may affect the value of another air pollutant at the same or even future time point. Interaction pairs are calculated by performing the kronecker product on two of time series vectors.
K_Interaction_cluster = list()
K_Interaction_periodic_cluster = list()
for (c in 1:num_clus){
#Grab interaction pair data for cluster c
cluster_data = decomposed_cluster_data[[c]]
W2_clus = cluster_data$W2
K_interaction_list = list()
K_interaction_periodic_list = list()
column_names = colnames(W2_clus)
time_span = nrow(W2_clus)
#Create sequence of indices corresponding to comparisons for real time and one lag interaction effects
lag0_idx = seq(2,3601,by=61)
lag1_idx = seq(1,3600,by=61)
#Calculate a kernel for each interaction pair
for (a in 1:length(column_names)){
interaction = W2_clus[,a]
#First calculate these two interaction kernels separately
K_int0 = matrix(nrow = 60,ncol = 60)
K_int1 = matrix(nrow = 60,ncol = 60)
K_int0_periodic = matrix(nrow = 60,ncol = 60)
K_int1_periodic = matrix(nrow = 60,ncol = 60)
rho_int = 1
sigma2_int = 1
for (i in 1:60){
for (j in 1:60){
#RBF kernels
K_int0[i,j] = exp(- ((interaction[lag0_idx[i]] - interaction[lag0_idx[j]])^2)
/ (2*rho_int)) * sigma2_int
K_int1[i,j] = exp(- ((interaction[lag1_idx[i]] - interaction[lag1_idx[j]])^2)
/ (2*rho_int)) * sigma2_int
#Locally periodic kernels
K_int0_periodic[i,j] = exp(- ((interaction[lag0_idx[i]] - interaction[lag0_idx[j]])^2)
/ (2*rho_int)) * exp(- (2*sin((abs(interaction[lag0_idx[i]] - interaction[lag0_idx[j]]))*pi/12)^2)
/ (rho_int)) * sigma2_int
K_int1_periodic[i,j] = exp(- ((interaction[lag1_idx[i]] - interaction[lag1_idx[j]])^2)
/ (2*rho_int)) * exp(- (2*sin((abs(interaction[lag1_idx[i]] - interaction[lag1_idx[j]]))*pi/12)^2)
/ (rho_int)) * sigma2_int
}
}
#Combine real time and one lag interaction kernels together
K_interaction = 0.5*K_int0 + 0.5*K_int1
K_interaction_list[[a]] = K_interaction
K_interaction_periodic = 0.5*K_int0_periodic + 0.5*K_int1_periodic
K_interaction_periodic_list[[a]] = K_interaction_periodic
}
#Combine kernels for each interaction pair together
K_interaction = matrix(0,nrow=60,ncol=60)
K_interaction_periodic = matrix(0,nrow=60,ncol=60)
K_interaction_periodic_weights = rep(1,length(K_interaction_periodic_list))
for(i in 1:length(K_interaction_periodic_list)){
K_interaction = K_interaction + ((1/length(K_interaction_list))*K_interaction_list[[i]])
K_interaction_periodic = K_interaction_periodic + ((1/length(K_interaction_periodic_list))*K_interaction_periodic_list[[i]])
K_interaction_periodic_weights[i] = norm(K_interaction_periodic_list[[i]],type = "F")
}
K_interaction_periodic_weights = K_interaction_periodic_weights / sum(K_interaction_periodic_weights)
print(K_interaction_periodic_weights)
# corrplot(K_interaction, order = 'original', cl.pos = 'b', tl.pos = 'n',method = "color", col = COL1('YlOrRd',10),
# title = "Interaction Covariance Structure")
# corrplot(K_interaction_periodic, order = 'original', cl.pos = 'b', tl.pos = 'n',method = "color", col = COL1('YlOrRd',10),
# title = "Periodic Interaction Covariance Structure")
matrix_heatmap(K_interaction,title = "Interaction Covariance Structure")
matrix_heatmap(K_interaction_periodic,title = "Periodic Interaction Covariance Structure")
#Store final interaction kernel (for all pairs) for each cluster
K_Interaction_cluster[[c]] = K_interaction
K_Interaction_periodic_cluster[[c]] = K_interaction_periodic
}
## [1] 0.01801276 0.01703143 0.01694759 0.01778825 0.01788158 0.01721644
## [7] 0.01642633 0.01788767 0.01844540 0.01811813 0.01863162 0.01880566
## [13] 0.01860215 0.01797847 0.01715008 0.01856778 0.01795322 0.01827947
## [19] 0.01870554 0.01812662 0.01744781 0.01661799 0.01798394 0.01763119
## [25] 0.01778816 0.01815186 0.01738221 0.01692601 0.01747119 0.01877287
## [31] 0.01841553 0.01797934 0.01874718 0.01787089 0.01766895 0.01762439
## [37] 0.01888591 0.01836894 0.01820732 0.01854172 0.01825139 0.01778447
## [43] 0.01684561 0.01862202 0.01808052 0.01764450 0.01849106 0.01829118
## [49] 0.01715437 0.01607265 0.01813384 0.01722352 0.01722589 0.01781774
## [55] 0.01792503 0.01739660
## [1] 0.01799398 0.01709268 0.01706268 0.01772355 0.01782421 0.01716357
## [7] 0.01645371 0.01787718 0.01840045 0.01814090 0.01858716 0.01873067
## [13] 0.01855974 0.01795085 0.01713023 0.01852411 0.01792645 0.01821370
## [19] 0.01863021 0.01806931 0.01739713 0.01679001 0.01802977 0.01771089
## [25] 0.01784844 0.01823677 0.01752284 0.01708067 0.01751396 0.01872658
## [31] 0.01839206 0.01796957 0.01865719 0.01782524 0.01768153 0.01756624
## [37] 0.01881072 0.01830463 0.01822472 0.01844018 0.01808511 0.01767478
## [43] 0.01715139 0.01860286 0.01811352 0.01780259 0.01846529 0.01824598
## [49] 0.01734650 0.01610391 0.01809783 0.01723231 0.01735257 0.01775926
## [55] 0.01784222 0.01733938
## [1] 0.01797992 0.01699144 0.01697681 0.01777718 0.01784029 0.01701896
## [7] 0.01637073 0.01785271 0.01844724 0.01818964 0.01863244 0.01878839
## [13] 0.01858867 0.01795503 0.01716656 0.01855519 0.01795217 0.01825505
## [19] 0.01869061 0.01806611 0.01747365 0.01672483 0.01804956 0.01770563
## [25] 0.01792620 0.01829659 0.01756626 0.01704488 0.01747957 0.01876871
## [31] 0.01841102 0.01805965 0.01870195 0.01779189 0.01767880 0.01757292
## [37] 0.01886938 0.01834226 0.01827614 0.01847973 0.01812173 0.01772425
## [43] 0.01687882 0.01864848 0.01813857 0.01779204 0.01842348 0.01829411
## [49] 0.01720398 0.01599703 0.01810676 0.01721032 0.01731120 0.01778245
## [55] 0.01788553 0.01716650
## [1] 0.01798815 0.01707250 0.01694199 0.01782006 0.01785618 0.01704239
## [7] 0.01638232 0.01785643 0.01847610 0.01814356 0.01865467 0.01878815
## [13] 0.01861039 0.01794113 0.01707292 0.01860913 0.01798194 0.01830786
## [19] 0.01867758 0.01808722 0.01738208 0.01662694 0.01801765 0.01768447
## [25] 0.01789614 0.01822566 0.01744549 0.01696939 0.01749136 0.01878553
## [31] 0.01847234 0.01805161 0.01874162 0.01786479 0.01769166 0.01758662
## [37] 0.01886923 0.01835239 0.01820703 0.01853056 0.01816756 0.01775636
## [43] 0.01685123 0.01865585 0.01816681 0.01771010 0.01849356 0.01829750
## [49] 0.01716661 0.01599038 0.01810632 0.01725543 0.01723000 0.01782367
## [55] 0.01789775 0.01722766
## [1] 0.01796224 0.01701220 0.01692662 0.01778606 0.01785974 0.01725829
## [7] 0.01634749 0.01782948 0.01844827 0.01812726 0.01863357 0.01876688
## [13] 0.01856995 0.01790556 0.01702322 0.01857154 0.01794633 0.01828354
## [19] 0.01866044 0.01823927 0.01734590 0.01661930 0.01800915 0.01769305
## [25] 0.01787700 0.01826233 0.01752535 0.01697313 0.01745899 0.01876345
## [31] 0.01843720 0.01802069 0.01872466 0.01786430 0.01765832 0.01759966
## [37] 0.01884744 0.01833805 0.01824918 0.01850735 0.01824258 0.01776540
## [43] 0.01691454 0.01860446 0.01822114 0.01775709 0.01848905 0.01827943
## [49] 0.01720040 0.01597595 0.01806987 0.01720394 0.01723912 0.01779865
## [55] 0.01789749 0.01740841
## [1] 0.01803884 0.01713361 0.01713705 0.01777179 0.01784107 0.01705886
## [7] 0.01651129 0.01792731 0.01845711 0.01817844 0.01860422 0.01876322
## [13] 0.01854934 0.01799890 0.01723745 0.01857292 0.01798694 0.01826528
## [19] 0.01866658 0.01795032 0.01749327 0.01683844 0.01804676 0.01771537
## [25] 0.01786577 0.01818352 0.01735629 0.01708117 0.01753902 0.01875002
## [31] 0.01841447 0.01802720 0.01866245 0.01779154 0.01768402 0.01758671
## [37] 0.01884653 0.01831461 0.01819993 0.01846422 0.01800512 0.01768366
## [43] 0.01697388 0.01860367 0.01792730 0.01766821 0.01843175 0.01817202
## [49] 0.01717160 0.01615916 0.01813972 0.01726979 0.01736962 0.01780464
## [55] 0.01784823 0.01725975
## [1] 0.01793093 0.01703255 0.01699054 0.01774645 0.01778528 0.01741396
## [7] 0.01643922 0.01777190 0.01836883 0.01810608 0.01856923 0.01867281
## [13] 0.01857209 0.01786711 0.01691149 0.01852418 0.01780964 0.01816871
## [19] 0.01847682 0.01826081 0.01726716 0.01667661 0.01797483 0.01756242
## [25] 0.01790155 0.01825023 0.01781933 0.01708938 0.01739940 0.01869059
## [31] 0.01833717 0.01800166 0.01861137 0.01802058 0.01764887 0.01751726
## [37] 0.01875468 0.01815959 0.01814963 0.01840496 0.01829976 0.01772545
## [43] 0.01721479 0.01859167 0.01820298 0.01795298 0.01856058 0.01832354
## [49] 0.01750714 0.01607387 0.01805412 0.01725451 0.01733893 0.01776503
## [55] 0.01784731 0.01763145
Combining interaction kernels from each cluster together:
K_interaction = matrix(0,nrow=60,ncol=60)
K_interaction_periodic = matrix(0,nrow=60,ncol=60)
K_interaction_periodic_weights = rep(1,num_clus)
for(i in 1:num_clus){
K_interaction = K_interaction + ((1/length(K_Interaction_cluster))*K_Interaction_cluster[[i]])
K_interaction_periodic = K_interaction_periodic + ((1/length(K_Interaction_periodic_cluster))*K_Interaction_periodic_cluster[[i]])
K_interaction_periodic_weights[i] = norm(K_Interaction_periodic_cluster[[i]],type = "F")
}
K_interaction_periodic_weights = K_interaction_periodic_weights / sum(K_interaction_periodic_weights)
print(K_interaction_periodic_weights)
## [1] 0.1425086 0.1431007 0.1425407 0.1426358 0.1428563 0.1426961 0.1436618
# corrplot(K_interaction, order = 'original', cl.pos = 'b', tl.pos = 'n',method = "color", col = COL1('YlOrRd',10),
# title = "Interaction Covariance Structure")
# corrplot(K_interaction_periodic, order = 'original', cl.pos = 'b', tl.pos = 'n',method = "color", col = COL1('YlOrRd',10),
# title = "Periodic Interaction Covariance Structure")
matrix_heatmap(K_interaction,title = "Interaction Covariance Structure")
## Warning in geom_rect(aes(xmin = x_min, xmax = x_max, ymin = y_min, ymax = y_max), : All aesthetics have length 1, but the data has 3600 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
## a single row.
matrix_heatmap(K_interaction_periodic,title = "Periodic Interaction Covariance Structure")
## Warning in geom_rect(aes(xmin = x_min, xmax = x_max, ymin = y_min, ymax = y_max), : All aesthetics have length 1, but the data has 3600 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
## a single row.
F_norm = function(kernel){
total = sum(as.numeric(abs(kernel)^2))
Fnorm = sqrt(total)
return(Fnorm)
}
KTA_norm = function(kernel1,kernel2){
centered_k1 = t(diag(nrow(kernel1)) - (1/nrow(kernel1) * t(diag(nrow(kernel1))) %*% diag(nrow(kernel1)))) %*%
kernel1 %*% diag(nrow(kernel1)) - (1/nrow(kernel1) * t(diag(nrow(kernel1))) %*% diag(nrow(kernel1)))
centered_k2 = t(diag(nrow(kernel2)) - (1/nrow(kernel2) * t(diag(nrow(kernel2))) %*% diag(nrow(kernel2)))) %*%
kernel2 %*% diag(nrow(kernel2)) - (1/nrow(kernel2) * t(diag(nrow(kernel2))) %*% diag(nrow(kernel2)))
term1 = centered_k1 / (F_norm(centered_k1))
term2 = centered_k2 / (F_norm(centered_k2))
term3 = term1 - term2
term4 = F_norm(term3)
measure = 1 - 0.5*(term4^2)
return(measure)
}
KTA_rownames = c("AR Invariant-AR RBF", "AR Invariant-AR LP",
"AR RBF-AR LP ", "DL Invariant-DL RBF",
"DL Invariant-DL LP", "DL RBF-DL LP",
"Interaction RBF-Interaction LP")
KTA_table = matrix(c(KTA_norm(K_AR_invariant,K_AR),
KTA_norm(K_AR_invariant,K_AR_periodic),
KTA_norm(K_AR,K_AR_periodic),
KTA_norm(K_DL_invariant,K_DL),
KTA_norm(K_DL_invariant,K_DL_periodic),
KTA_norm(K_DL,K_DL_periodic),
KTA_norm(K_interaction,K_interaction_periodic)),
nrow=7)
KTA_table = data.frame(KTA_table)
rownames(KTA_table) = KTA_rownames
colnames(KTA_table) = "Centered KTA via inner product"
KTA_table
## Centered KTA via inner product
## AR Invariant-AR RBF 0.5993978
## AR Invariant-AR LP 0.6105226
## AR RBF-AR LP 0.9997569
## DL Invariant-DL RBF 0.7920310
## DL Invariant-DL LP 0.7920310
## DL RBF-DL LP 1.0000000
## Interaction RBF-Interaction LP 0.9999298
In the mortality dataset obtained from Cal-ViDa, all of the cells with small values i.e., less than 10 but not equal to 0, were censored. So in order to avoid using a truncated Poisson distribution, we decided to impute these censored values with an EM algorithm which is described below:
Due to interval censoring, we do not observe the exact mortality counts for some units. Let \(C_i\) be the censoring indicator such that \(C_i = I(1 \le Y_i \le 10)\), i.e., the count is censored if it is between 1 and 10. For the EM algorithm, we will be modeling the rate using a generalized linear model where we use the variables month (our time index), cause of death, and age group as predictors. So let us assume that \(\lambda_i(\beta) = \exp(\alpha_i + \textbf{X}_i^T \boldsymbol{\beta})\), where \(\alpha_i\) is the offset (log of county population of the age group) and \(\textbf{X}_i\) are predictors (month, cause of death, age group). Derivations for the EM algorithm can be found in appendix B of the paper.
We coded this algorithm as follows:
First, we needed to get an initial estimate of our \(\beta\) coefficients in our Poisson regression model. We included age group, county of death, cause of death (either influenza+pneumonia OR chronic lower respiratory disease), and month of death as covariates.
We need to perform an initial imputation to get a complete dataset to fit a model on. We decided to do this by making a crude estimation of the rate per 100,000 people \(\lambda_i\). To do this, we calculated a population weighted mean of the number of deaths across all ages and months. However, we only had observed populations at the county level, not for each specific age group included in the mortality data. So using census data which told us the approximate populations for specific age groups (for all of California), we were able to calculate approximate population sizes for each of the age categories included in the mortality data. See death_byage2 for reference. Then using the ratio of a given county’s population relative to the entire population of California, we were able to calculate approximations for the population size of each age group for each county in our mortality dataset. These served as the weights for our population weighted average of the rate of respiratory deaths in California.
population_age = read_xlsx("Population Categories.xlsx")
## New names:
## • `` -> `...2`
## • `` -> `...3`
## • `` -> `...4`
## • `` -> `...5`
## • `` -> `...6`
## • `` -> `...7`
## • `` -> `...8`
## • `` -> `...9`
## • `` -> `...10`
## • `` -> `...11`
## • `` -> `...12`
## • `` -> `...13`
#head(population_age)
population_age = population_age[-(1:5),2]
population_age$...2 = as.numeric(population_age$...2)
#split under 5 category into < 1 and 1-4 years old
less1 = floor(population_age$...2[1]*0.2)
onefour = floor(population_age$...2[1]*0.8)
death_byage = population_age$...2[-1]
death_byage = c(less1,onefour,death_byage)
death_byage = death_byage[1:19]
death_byage2 = death_byage[1:2]
idx = seq(from = 3, to = 17, by = 2)
for (i in idx){
death_byage2 = c(death_byage2,(death_byage[i]+death_byage[i+1]))
}
death_byage2 = c(death_byage2,death_byage[19])
age_groups = unique(mortality$Age)
death_byage2 = data.frame(cbind(age_groups,death_byage2))
colnames(death_byage2) = c("Age_Group","Population_by_Age")
death_byage2$Population_by_Age = as.numeric(death_byage2$Population_by_Age)
#head(death_byage2)
#2010-2019 population data for CA
USpops = read.csv("CA_census_pops1019.csv")
CApops = USpops %>% filter(STNAME == "California") %>% select(CTYNAME,POPESTIMATE2019)
counties = countycodes$value_represented #from EPA data file
weights = CApops[(2:59),2]
weights = weights/CApops[1,2]
groups = unique(mortality$Age)
step1 = 1
step2 = 1
for (i in counties){
for (j in groups){
idx = which(mortality$Age == j & mortality$County_of_Death == i)
mortality$Population[idx] = ceiling(death_byage2$Population[step1]*weights[step2])
step1 = step1+1
}
step1 = 1
step2 = step2+1
}
mortality$logpop = log(mortality$Population)
censored_idx = which(mortality$Total_Deaths == "<11")
censorTF = mortality$Total_Deaths == "<11"
mortality = cbind(mortality,censorTF)
#head(mortality)
GETTING INITIAL GUESS FOR LAMBDA: AVG DEATHS (PER 100K PEOPLE) PER MONTH FOR ONE COUNTY
uncensored_mortality = mortality %>% filter(censorTF == FALSE) %>% select(Total_Deaths,Population)
uncensored_mortality$Total_Deaths = as.numeric(uncensored_mortality$Total_Deaths)
theta = mean(uncensored_mortality$Total_Deaths*100000/uncensored_mortality$Population)
By using all the data, I obtained a crude initial guess for \(\lambda\) of about 1.08. Using this initial estimate \(\lambda\), we calculated the expected value for each \(Z_i\) to get an initial imputed dataset. This dataset will be used to estimate a Poisson regression model which will give us our initial value for our actual parameters of interest \(\beta\).
FUNCTION FOR IMPUTING CENSORED VALUE BASED ON EXPECTATION GIVEN LAMBDA
impute_small_values = function(lambda){
x = 1:10
p = dpois(x,lambda)
value = sum(x*p)/sum(p)
return(value)
}
INITIAL IMPUTATION:
mortality2 = mortality
mortality2$Total_Deaths[censored_idx] = 0.01
mortality2$Total_Deaths = as.numeric(mortality2$Total_Deaths)
for (i in censored_idx){
lambda = theta*mortality2$Population[i] / 100000
deaths = impute_small_values(lambda)
mortality2$Total_Deaths[i] = floor(deaths)
}
INITIAL REGRESSION MODELS:
mortality2$Age = factor(mortality2$Age)
mortality2$Cause_of_Death = factor(mortality2$Cause_of_Death)
mortality2$Month = factor(mortality2$Month)
pois_reg = glm(Total_Deaths ~ Age + Cause_of_Death + Month + offset(logpop), family = "poisson", data = mortality2)
# ZIP_reg = zeroinfl(Total_Deaths ~ Age + Cause_of_Death + Month + offset(logpop) | 1, data = mortality2, dist = "poisson", link = "logit")
vec0 = coef(pois_reg)
# vec0 = coef(ZIP_reg)
summary(pois_reg)
##
## Call:
## glm(formula = Total_Deaths ~ Age + Cause_of_Death + Month + offset(logpop),
## family = "poisson", data = mortality2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -5.4252 -0.5207 -0.2124 -0.0650 14.5038
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -13.744433 0.087789 -156.562 < 2e-16
## Age15 - 24 years 0.568750 0.095519 5.954 2.61e-09
## Age25 - 34 years 1.081482 0.091812 11.779 < 2e-16
## Age35 - 44 years 1.667882 0.090131 18.505 < 2e-16
## Age45 - 54 years 2.381811 0.088827 26.814 < 2e-16
## Age5 - 14 years 0.236541 0.099117 2.386 0.017
## Age55 - 64 years 3.136398 0.088079 35.609 < 2e-16
## Age65 - 74 years 4.183664 0.087714 47.697 < 2e-16
## Age75 - 84 years 5.352522 0.087582 61.115 < 2e-16
## Age85 years and over 6.499169 0.087521 74.258 < 2e-16
## AgeLess than 1 year 0.965400 0.138658 6.962 3.34e-12
## Cause_of_DeathInfluenza and pneumonia -0.751349 0.007035 -106.798 < 2e-16
## Month2 -0.253385 0.013643 -18.573 < 2e-16
## Month3 -0.251082 0.013634 -18.416 < 2e-16
## Month4 -0.450337 0.014455 -31.155 < 2e-16
## Month5 -0.522552 0.014781 -35.352 < 2e-16
## Month6 -0.646664 0.015382 -42.040 < 2e-16
## Month7 -0.661526 0.015458 -42.796 < 2e-16
## Month8 -0.709630 0.015707 -45.180 < 2e-16
## Month9 -0.756347 0.015957 -47.400 < 2e-16
## Month10 -0.651645 0.015407 -42.294 < 2e-16
## Month11 -0.617588 0.015237 -40.532 < 2e-16
## Month12 -0.357779 0.014059 -25.449 < 2e-16
##
## (Intercept) ***
## Age15 - 24 years ***
## Age25 - 34 years ***
## Age35 - 44 years ***
## Age45 - 54 years ***
## Age5 - 14 years *
## Age55 - 64 years ***
## Age65 - 74 years ***
## Age75 - 84 years ***
## Age85 years and over ***
## AgeLess than 1 year ***
## Cause_of_DeathInfluenza and pneumonia ***
## Month2 ***
## Month3 ***
## Month4 ***
## Month5 ***
## Month6 ***
## Month7 ***
## Month8 ***
## Month9 ***
## Month10 ***
## Month11 ***
## Month12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 382636 on 91871 degrees of freedom
## Residual deviance: 74194 on 91849 degrees of freedom
## AIC: 126734
##
## Number of Fisher Scoring iterations: 7
# summary(ZIP_reg)
Now that we have initialized our parameters, \(\beta^{(0)}\), we can proceed with the EM algorithm until our parameters (the coefficients of our regression model), converge.
The main steps implemented in the chunk below are:
Given a newly fitted Poisson regression model with parameter values \(\beta^{(t)}\), take its fitted values for the \(\lambda\)’s corresponding to observations that were censored in the original mortality dataset
Use those fitted \(\lambda\)’s, calculate the expected value of our unknown values \(Z\)
Once all \(Z_i\)’s are imputed, we can use the now complete dataset to estimate the Poisson regression model again, which will produce the maximum likelihood estimate of our parameters \(\beta\), these are our new values \(\beta^{(t+1)}\).
Compare the difference between our new \(\beta\) coefficient estimates with those from the previous iteration and either perform another iteration or stop the algorithm if the maximum difference between coefficients from different iterations is less than 0.01.
Note: I experimented with a ZIP regression model as well but the log likelihood values at each iteration were generally higher for the Poisson regression model
mortality3 = mortality2
model = pois_reg
model_diff = 100
iter = 1
vec0 = coef(model)
while((model_diff > 0.01) & (iter < 10)){
#impute data (should be between 1-10)
fvs = fitted.values(model)
for (i in censored_idx){
deaths = impute_small_values(fvs[i])
mortality3$Total_Deaths[i] = floor(deaths)
}
#fit model on "new" data
model = glm(Total_Deaths ~ Age + Cause_of_Death + Month + offset(logpop), family = "poisson", data = mortality3)
vec1 = coef(model)
model_diff = max(abs(vec1 - vec0))
iter = iter+1
vec0 = vec1
}
final_pois_reg = model
summary(final_pois_reg)
##
## Call:
## glm(formula = Total_Deaths ~ Age + Cause_of_Death + Month + offset(logpop),
## family = "poisson", data = mortality3)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.9268 -0.3514 -0.1387 -0.0449 12.3014
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -14.351349 0.116534 -123.152 < 2e-16
## Age15 - 24 years -0.054190 0.135792 -0.399 0.689845
## Age25 - 34 years 0.430115 0.127371 3.377 0.000733
## Age35 - 44 years 1.210281 0.122028 9.918 < 2e-16
## Age45 - 54 years 2.345599 0.118274 19.832 < 2e-16
## Age5 - 14 years -0.455007 0.145826 -3.120 0.001807
## Age55 - 64 years 3.744079 0.116771 32.063 < 2e-16
## Age65 - 74 years 5.001304 0.116457 42.946 < 2e-16
## Age75 - 84 years 6.168765 0.116379 53.006 < 2e-16
## Age85 years and over 7.285988 0.116346 62.623 < 2e-16
## AgeLess than 1 year 1.536529 0.158560 9.691 < 2e-16
## Cause_of_DeathInfluenza and pneumonia -0.719013 0.006477 -111.013 < 2e-16
## Month2 -0.245579 0.012764 -19.240 < 2e-16
## Month3 -0.234760 0.012726 -18.448 < 2e-16
## Month4 -0.416647 0.013416 -31.057 < 2e-16
## Month5 -0.488946 0.013715 -35.649 < 2e-16
## Month6 -0.603208 0.014221 -42.418 < 2e-16
## Month7 -0.631446 0.014352 -43.998 < 2e-16
## Month8 -0.672621 0.014547 -46.237 < 2e-16
## Month9 -0.708569 0.014723 -48.128 < 2e-16
## Month10 -0.618620 0.014292 -43.285 < 2e-16
## Month11 -0.580590 0.014118 -41.125 < 2e-16
## Month12 -0.337932 0.013106 -25.784 < 2e-16
##
## (Intercept) ***
## Age15 - 24 years
## Age25 - 34 years ***
## Age35 - 44 years ***
## Age45 - 54 years ***
## Age5 - 14 years **
## Age55 - 64 years ***
## Age65 - 74 years ***
## Age75 - 84 years ***
## Age85 years and over ***
## AgeLess than 1 year ***
## Cause_of_DeathInfluenza and pneumonia ***
## Month2 ***
## Month3 ***
## Month4 ***
## Month5 ***
## Month6 ***
## Month7 ***
## Month8 ***
## Month9 ***
## Month10 ***
## Month11 ***
## Month12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 455249 on 91871 degrees of freedom
## Residual deviance: 38553 on 91849 degrees of freedom
## AIC: 97178
##
## Number of Fisher Scoring iterations: 8
logLik(final_pois_reg)
## 'log Lik.' -48566.18 (df=23)
Alternatively, we can take the first and second derivative and apply a Newton-Raphson procedure if we want to solve for \(\beta\) numerically. To solve for the fixed point solutions for \(\beta\) numerically, we would first take the derivative of \(Q()\) wrt each \(\beta_j\) which gives us:
\(\frac{dQ}{d\beta_j} = \sum_{i \in D \backslash W}^{n_{obs}}[y_i - exp(\alpha_i + x_i \beta)]x_{ij} + \sum_{i \in W}^N [\tilde y_i - exp(\alpha_i + x_i \beta)]x_{ij}\) where \(\alpha_i\) represents the offset associated with each observation \(i\) and \(\tilde y_i\) represents the imputed values corresponding to censored observation \(Z_i\)
Rewriting this in vector form (since we have 3 covariates, which are all categorical) we obtain the following gradient:
\(f'(\beta) = \frac{dQ}{d \overrightarrow \beta} = X^T [\overrightarrow y - exp(\overrightarrow \alpha + X \beta)]\) where first term is a \(p\times n\) matrix and second term is a \(n\times 1\) vector
Then, we take another derivative to get the Hessian:
\(f''_{jj'}(\beta) = -\sum_{i=1}^N exp(\alpha_i + x_i \beta) X_{ij} X_{ij'} = -X^T diag(exp(\overrightarrow \alpha + X \beta)) X\)
Finally, we solve for the next value for our \(\beta\)s using these two values with the following equation:
\(\beta^{(b)} = \beta^{(b-1)} - [(f''(\beta^{(b-1)}))^{-1} f'(\beta^{(b-1)})]\)
Now we need to code up a Newton Raphson function and initialize it:
Y = mortality2$Total_Deaths
#X is design matrix w col of 1s then each level of each categorical predictors except their baselines
intercept = rep(1,length(Y))
#Age categories
Age1524 = as.numeric(mortality2$Age == "15 - 24 years")
Age2534 = as.numeric(mortality2$Age == "25 - 34 years")
Age3544 = as.numeric(mortality2$Age == "35 - 44 years")
Age4554 = as.numeric(mortality2$Age == "45 - 54 years")
Age514 = as.numeric(mortality2$Age == "5 - 14 years")
Age5564 = as.numeric(mortality2$Age == "55 - 64 years")
Age6574 = as.numeric(mortality2$Age == "65 - 74 years")
Age7584 = as.numeric(mortality2$Age == "75 - 84 years")
Age85 = as.numeric(mortality2$Age == "85 years and over")
Age1 = as.numeric(mortality2$Age == "Less than 1 year")
#Cause of death categories
Cause2 = as.numeric(mortality2$Cause_of_Death == "Influenza and pneumonia")
#Month categories
Month2 = as.numeric(mortality2$Month == 2)
Month3 = as.numeric(mortality2$Month == 3)
Month4 = as.numeric(mortality2$Month == 4)
Month5 = as.numeric(mortality2$Month == 5)
Month6 = as.numeric(mortality2$Month == 6)
Month7 = as.numeric(mortality2$Month == 7)
Month8 = as.numeric(mortality2$Month == 8)
Month9 = as.numeric(mortality2$Month == 9)
Month10 = as.numeric(mortality2$Month == 10)
Month11 = as.numeric(mortality2$Month == 11)
Month12 = as.numeric(mortality2$Month == 12)
X = cbind(intercept,Age1524,Age2534,Age3544,Age4554,Age514,Age5564,Age6574,Age7584,Age85,Age1,
Cause2,Month2,Month3,Month4,Month5,Month6,Month7,Month8,Month9,Month10,
Month11,Month12)
# dim(X)
offset_vec = offset(mortality2$logpop)
offset_vec = matrix(offset_vec,ncol=1)
#Initial guesses for beta
B = coef(pois_reg)
B = matrix(B,ncol=1)
#Define first derivative of Q function
f_gradient = function(Y,X,B){
value = t(X) %*% (Y - exp(X %*% B + offset_vec))
return(value)
}
# f_gradient(Y,X,B)
#Define second derivative of Q function
f_hessian = function(Y,X,B){
middle = as.numeric(exp(X %*% B + offset_vec))
X2 = X
for (i in 1:length(middle)){
X2[i,] = X[i,] * middle[i]
}
value = -t(X) %*% X2
return(value)
}
# dim(f_hessian(Y,X,B))
#Define Newton Raphson function and compute initial beta coefficient estimates
Newton_Raphson = function(Y,X,x0,tol = 0.001,eps = 0.01,max_iter = 100){
for (i in 1:max_iter){
g = f_gradient(Y,X,x0)
h = f_hessian(Y,X,x0)
value = abs(det(h))
if (value < eps){
break
}
x1 = x0 - (solve(h) %*% g)
# x1 = x0 - (solve(h) %*% g * (0.01 * 0.999^i)) #gradient descent is too large at each iteration so need to slow it down
if (max(abs(x1-x0)) <= tol){
return(x1)
}
x0 = x1
}
return(x0)
}
#Initial beta coefficient estimates
new_coefs = Newton_Raphson(Y,X,B)
Similar to above, now that we have initialized our parameters, \(\beta^{(0)}\), we can proceed with the EM algorithm until our parameters (the coefficients of our regression model), converge.
The main steps implemented in the chunk below are:
Given newly estimated parameter values \(\beta^{(t)}\) from the Newton-Raphson procedure above, take its fitted values for the \(\lambda\)’s corresponding to observations that were censored in the original mortality dataset
Use those fitted \(\lambda\)’s, calculate the expected value of our unknown values \(Z\)
Once all \(Z_i\)’s are imputed, we can use the now complete dataset to estimate the beta coefficients with Newton-Raphson again, which will produce the maximum likelihood estimate of our parameters \(\beta\), these are our new values \(\beta^{(t+1)}\).
Compare the difference between our new \(\beta\) coefficient estimates with those from the previous iteration and either perform another iteration or stop the algorithm if the maximum difference between coefficients from different iterations is less than 0.01.
while((model_diff > 0.01) & (iter < 10)){
#impute data (should be between 1-10)
fvs_NR = exp((X %*% new_coefs) + offset_vec)
for (i in censored_idx){
deaths = impute_small_values(fvs_NR[i])
Y[i] = floor(deaths)
}
new_coefs2 = Newton_Raphson(Y,X,new_coefs)
model_diff = max(abs(new_coefs2 - new_coefs))
new_coefs = new_coefs2
iter = iter+1
vec0 = vec1
}
new_coefs
## [,1]
## intercept -13.7444329
## Age1524 0.5687496
## Age2534 1.0814825
## Age3544 1.6678819
## Age4554 2.3818112
## Age514 0.2365414
## Age5564 3.1363978
## Age6574 4.1836642
## Age7584 5.3525223
## Age85 6.4991688
## Age1 0.9653963
## Cause2 -0.7513495
## Month2 -0.2533848
## Month3 -0.2510821
## Month4 -0.4503366
## Month5 -0.5225520
## Month6 -0.6466641
## Month7 -0.6615256
## Month8 -0.7096304
## Month9 -0.7563473
## Month10 -0.6516455
## Month11 -0.6175879
## Month12 -0.3577794
summary(final_pois_reg)
##
## Call:
## glm(formula = Total_Deaths ~ Age + Cause_of_Death + Month + offset(logpop),
## family = "poisson", data = mortality3)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.9268 -0.3514 -0.1387 -0.0449 12.3014
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -14.351349 0.116534 -123.152 < 2e-16
## Age15 - 24 years -0.054190 0.135792 -0.399 0.689845
## Age25 - 34 years 0.430115 0.127371 3.377 0.000733
## Age35 - 44 years 1.210281 0.122028 9.918 < 2e-16
## Age45 - 54 years 2.345599 0.118274 19.832 < 2e-16
## Age5 - 14 years -0.455007 0.145826 -3.120 0.001807
## Age55 - 64 years 3.744079 0.116771 32.063 < 2e-16
## Age65 - 74 years 5.001304 0.116457 42.946 < 2e-16
## Age75 - 84 years 6.168765 0.116379 53.006 < 2e-16
## Age85 years and over 7.285988 0.116346 62.623 < 2e-16
## AgeLess than 1 year 1.536529 0.158560 9.691 < 2e-16
## Cause_of_DeathInfluenza and pneumonia -0.719013 0.006477 -111.013 < 2e-16
## Month2 -0.245579 0.012764 -19.240 < 2e-16
## Month3 -0.234760 0.012726 -18.448 < 2e-16
## Month4 -0.416647 0.013416 -31.057 < 2e-16
## Month5 -0.488946 0.013715 -35.649 < 2e-16
## Month6 -0.603208 0.014221 -42.418 < 2e-16
## Month7 -0.631446 0.014352 -43.998 < 2e-16
## Month8 -0.672621 0.014547 -46.237 < 2e-16
## Month9 -0.708569 0.014723 -48.128 < 2e-16
## Month10 -0.618620 0.014292 -43.285 < 2e-16
## Month11 -0.580590 0.014118 -41.125 < 2e-16
## Month12 -0.337932 0.013106 -25.784 < 2e-16
##
## (Intercept) ***
## Age15 - 24 years
## Age25 - 34 years ***
## Age35 - 44 years ***
## Age45 - 54 years ***
## Age5 - 14 years **
## Age55 - 64 years ***
## Age65 - 74 years ***
## Age75 - 84 years ***
## Age85 years and over ***
## AgeLess than 1 year ***
## Cause_of_DeathInfluenza and pneumonia ***
## Month2 ***
## Month3 ***
## Month4 ***
## Month5 ***
## Month6 ***
## Month7 ***
## Month8 ***
## Month9 ***
## Month10 ***
## Month11 ***
## Month12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 455249 on 91871 degrees of freedom
## Residual deviance: 38553 on 91849 degrees of freedom
## AIC: 97178
##
## Number of Fisher Scoring iterations: 8
NEWTON RAPHSON APPROACH DOES NOT WORK WELL BC AT EACH ITERATION VALUES ARE CHANGING BY TOO MUCH, LEADS TO HESSIAN MATRIX BEING UNINVERTIBLE
Now that we have imputed the censored “< 11” values in the Cal-ViDa dataset, we will now aggregate the data to get total number of respiratory related deaths for each county for every month between 2014-2019.
print("Summary for respiratory related mortality")
## [1] "Summary for respiratory related mortality"
summary(mortality3$Total_Deaths) #summary of deaths per month, for all counties i.e. all of CA, age groups, and months 2014-2019
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 1.178 0.000 209.000
print("Summary for EPA data")
## [1] "Summary for EPA data"
for (i in pollutants$parametercodes.code){
data = final_EPA_data %>% filter(Pollutant == i)
print(summary(data$Value)) #summary of values for each pollutant, for all counties i.e. all of CA, all months (2014-2019)
AQI = data$AQI
}
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0120 0.0140 0.0160 0.0184 0.0190 0.0655
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.2292 0.3000 0.3458 0.3719 0.4333 0.6500
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.2636 0.4864 0.6545 0.6448 0.7773 1.1318
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.402 4.939 7.139 8.351 11.071 21.668
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.008118 0.020000 0.026412 0.025109 0.028948 0.037000
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 7.00 13.00 18.00 20.09 25.00 70.50
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.200 6.700 7.800 8.546 9.600 23.350
summary(AQI)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 25.00 33.00 36.00 38.34 40.50 75.00
print("Summary for SDI data")
## [1] "Summary for SDI data"
summary(soa.data$Score) #summary of SDI score, for all counties i.e. all of CA, all years 2010-2019
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 67.38 93.45 106.95 112.53 131.69 183.67
First, we wanted to combine the number of deaths from the two different causes into a total number of respiratory related deaths for each age group. Then, we combined the total number of deaths for each age group for a given month and county in a singular total for that month and county.
#Aggregating by cause of death
data = mortality3
data1 = data %>% filter(Cause_of_Death == "Chronic lower respiratory diseases")
data2 = data %>% filter(Cause_of_Death == "Influenza and pneumonia")
newdeaths = data1$Total_Deaths + data2$Total_Deaths
data1$Total_Deaths = newdeaths
respmortality = data1[,-5]
#Creates total deaths by adding deaths of all age groups together
agg.respmortality = respmortality[1,]
agg.respmortality$Age = as.character(agg.respmortality$Age)
rows2 = seq(1,nrow(respmortality),11)
for (i in rows2){
agg.respmortality[i,] = respmortality[i,]
agg.respmortality$Total_Deaths[i] = sum(respmortality$Total_Deaths[i:(i+10)])
agg.respmortality$Age[i] = "Everyone"
}
agg.respmortality = na.omit(agg.respmortality)
rownames(agg.respmortality) = NULL
total.respmortality = agg.respmortality[,-c(6:9)]
Reformat dataset into time series format (rows are counties, columns are months)
Total deaths:
months = unique(total.respmortality$Month_of_Death)
years = sort(unique(total.respmortality$Year_of_Death))
counties = unique(total.respmortality$County_of_Death)
x = 0
total.mortality.ts = matrix(1,nrow = 58, ncol = 72)
for (k in counties){
county.ts = c()
x = x+1
for (i in years){
for (j in months){
deaths = total.respmortality %>% filter(County_of_Death == k) %>% filter(Year_of_Death == i) %>% filter(Month_of_Death == j) %>% select(Total_Deaths) %>% as.numeric()
county.ts = c(county.ts,deaths)
}
}
total.mortality.ts[x,] = county.ts
}
#Label time series data
total.mortality.ts = as.data.frame(total.mortality.ts)
dates = c()
x=1
for (i in years){
for (j in months){
dates[x] = sprintf("%1.0f/%1.0f",j,i)
x = x+1
}
}
colnames(total.mortality.ts) = dates
ID = c(1:58)
total.mortality.ts = cbind(ID,counties,total.mortality.ts)
total.mortality.ts = left_join(clusterlabels,total.mortality.ts,by = "counties")
head(total.mortality.ts)
## counties Cluster ID 1/2014 2/2014 3/2014 4/2014 5/2014 6/2014 7/2014 8/2014
## 1 Alameda 4 1 84 61 57 57 55 44 44 44
## 2 Alpine 5 2 0 0 0 0 0 0 0 0
## 3 Amador 5 3 2 3 2 2 2 1 3 1
## 4 Butte 2 4 15 10 11 8 5 6 7 6
## 5 Calaveras 5 5 3 4 2 3 0 0 1 0
## 6 Colusa 1 6 0 1 1 0 1 0 0 0
## 9/2014 10/2014 11/2014 12/2014 1/2015 2/2015 3/2015 4/2015 5/2015 6/2015
## 1 42 51 50 62 101 72 63 61 63 46
## 2 0 0 0 0 0 0 0 0 0 0
## 3 1 3 2 1 3 3 3 5 4 4
## 4 7 5 7 9 13 8 10 19 6 8
## 5 1 2 1 1 2 0 2 2 1 2
## 6 1 0 0 0 1 2 0 1 0 0
## 7/2015 8/2015 9/2015 10/2015 11/2015 12/2015 1/2016 2/2016 3/2016 4/2016
## 1 44 40 36 41 41 64 78 56 62 55
## 2 0 0 0 0 0 0 0 0 0 0
## 3 2 2 2 2 2 4 3 3 2 2
## 4 6 10 3 7 8 6 13 6 8 7
## 5 2 1 2 0 1 4 3 2 4 2
## 6 0 0 1 1 1 2 1 0 1 0
## 5/2016 6/2016 7/2016 8/2016 9/2016 10/2016 11/2016 12/2016 1/2017 2/2017
## 1 39 44 47 43 37 53 48 60 114 66
## 2 0 0 0 0 0 1 0 0 0 0
## 3 0 0 1 1 2 0 2 2 4 2
## 4 18 4 4 6 7 6 8 8 11 11
## 5 3 2 3 2 1 0 0 3 2 1
## 6 0 0 0 0 0 0 0 2 0 1
## 3/2017 4/2017 5/2017 6/2017 7/2017 8/2017 9/2017 10/2017 11/2017 12/2017
## 1 62 59 45 45 48 37 41 40 44 62
## 2 0 0 0 0 0 0 0 0 0 0
## 3 1 3 3 3 4 2 2 1 1 2
## 4 11 10 8 10 6 8 5 10 7 9
## 5 2 4 2 2 1 2 1 0 0 3
## 6 0 0 0 1 0 1 0 0 0 1
## 1/2018 2/2018 3/2018 4/2018 5/2018 6/2018 7/2018 8/2018 9/2018 10/2018
## 1 114 58 62 52 50 46 49 39 39 49
## 2 0 0 0 0 0 0 0 0 0 0
## 3 6 2 2 0 2 3 1 1 2 3
## 4 13 10 10 9 8 7 6 8 7 8
## 5 3 4 3 2 2 2 1 1 5 3
## 6 0 0 1 1 2 0 0 0 1 0
## 11/2018 12/2018 1/2019 2/2019 3/2019 4/2019 5/2019 6/2019 7/2019 8/2019
## 1 41 49 60 61 56 60 38 43 48 45
## 2 1 0 0 0 0 0 0 0 0 0
## 3 3 2 4 2 2 4 0 1 3 0
## 4 8 6 10 8 8 7 7 8 6 6
## 5 2 1 3 3 2 1 1 4 0 2
## 6 0 0 2 0 0 1 1 1 0 1
## 9/2019 10/2019 11/2019 12/2019
## 1 38 44 47 68
## 2 0 0 0 0
## 3 2 1 2 3
## 4 5 5 8 7
## 5 3 1 1 0
## 6 1 0 1 2
HOW MANY 0s DOES EACH COUNTY HAVE?
numzeros_total = c()
for (i in 1:58){
numzeros_total[i] = length(which(total.mortality.ts[i,3:74] == 0))
}
numzeros_total
## [1] 0 69 6 0 10 42 0 8 0 0 32 0 0 20 0 1 0 31 0 0 0 31 1 0 25
## [26] 62 0 0 1 0 0 25 0 0 13 0 0 0 0 0 0 0 0 0 0 61 2 0 0 0
## [51] 3 2 38 0 3 0 0 0
propzeros_total = numzeros_total/72
length(which(propzeros_total > 0.85))
## [1] 2
countycodes$value_represented[which(propzeros_total > 0.75)]
## [1] "Alpine" "Mono" "Sierra"
hist(propzeros_total,breaks = 20 ,xlab = "Proportion of months with 0 deaths",main = "Do some counties have more strings of 0s than others?")
One aspect of the data that we wanted to examine before proceeding with our analysis was the frequency in which there were 0 deaths in a given county for a month. This would inform us about whether a standard Poisson model or a zero inflated Poisson model would be more appropriate. What I did above was first calculate the proportion of months (out of 72) that had 0 deaths observed for each county. Then identified which counties had a proportion of 0s greater than 75%, 85%, etc. Then, I made a histogram which shows there are only a couple of counties (which have very small populations) that had a high frequency of 0s. The aggregation performed in previous steps addressed the zero inflation it appears.
AGGREGATING MORTALITY DATA INTO CLUSTERS AS OPPOSED TO EACH COUNTY (ALSO AGGREGATED TO MORTALITY RATE PER CLUSTER)
Again, we want the number of respiratory related deaths at the cluster level, not the county level. So we once again aggregate the observations for each county in a given cluster. First, we simply add all the observations in cluster together to get a total number of respiratory related deaths for the months of 2014-2019 for each cluster. Then, we also calculated a mortality rate (per 100k people) for each cluster. This was done by taking the total number of deaths for a given cluster and dividing it by the total population of that cluster times 100,000 i.e. (deaths\(*\frac{100000}{clusterpop}\)). This second dataset will be used for our Gaussian process regression model which needs to be fit on a continuous response variable.
#County populations by year pulled from SoA data
countypops = CA_data %>% filter(Year > 2013) %>% select(Total_Pop,County,Year) %>% unique()
countypops = cbind(countypops,Cluster = rep(clusterlabels$Cluster,each=6))
cluster_mortality_total = matrix(NA,nrow = 72,ncol = num_clus)
cluster_mortality_rate = matrix(NA,nrow = 72,ncol = num_clus)
for (i in 1:num_clus){
cluster = total.mortality.ts %>% filter(Cluster == i)
year = 2014
for(j in 1:72){
col = cluster[,j+3]
#Sum of deaths across counties in a cluster
cluster_mortality_total[j,i] = sum(na.omit(col))
#Rate of deaths (per 100,000) across counties in a cluster
pops = countypops %>% filter(Year == year,Cluster == i) %>% select(Total_Pop)
cluster.pop = sum(pops)
cluster_mortality_rate[j,i] = (sum(na.omit(col))/cluster.pop)*100000
if ((j>12) & (j<25)){
year = 2015
}
else if ((j>24) & (j<37)){
year = 2016
}
else if ((j>36) & (j<49)){
year = 2017
}
else if ((j>48) & (j<61)){
year = 2018
}
else if ((j>60) & (j<73)){
year = 2019
}
else{
year = 2014
}
}
}
#Time series of total deaths for each cluster
colnames(cluster_mortality_total) = c("Cluster 1","Cluster 2","Cluster 3",
"Cluster 4","Cluster 5","Cluster 6",
"Cluster 7")
rownames(cluster_mortality_total) = colnames(total.mortality.ts[4:75])
cluster_mortality_total = data.frame(cluster_mortality_total)
head(cluster_mortality_total)
## Cluster.1 Cluster.2 Cluster.3 Cluster.4 Cluster.5 Cluster.6 Cluster.7
## 1/2014 55 80 80 601 310 15 1101
## 2/2014 39 51 44 465 250 6 867
## 3/2014 32 47 42 437 234 15 854
## 4/2014 36 45 38 395 203 12 705
## 5/2014 31 45 36 360 195 17 710
## 6/2014 25 37 43 308 166 9 663
#Time series of rate of deaths (per 100,000) for each cluster
colnames(cluster_mortality_rate) = c("Cluster 1","Cluster 2","Cluster 3",
"Cluster 4","Cluster 5","Cluster 6",
"Cluster 7")
rownames(cluster_mortality_rate) = colnames(total.mortality.ts[4:75])
cluster_mortality_rate = data.frame(cluster_mortality_rate)
head(cluster_mortality_rate)
## Cluster.1 Cluster.2 Cluster.3 Cluster.4 Cluster.5 Cluster.6 Cluster.7
## 1/2014 5.431745 6.635007 6.711308 5.644215 5.944301 6.013784 5.891607
## 2/2014 3.851601 4.229817 3.691219 4.366988 4.793791 2.405513 4.639440
## 3/2014 3.160288 3.898066 3.523437 4.104030 4.486989 6.013784 4.569875
## 4/2014 3.555324 3.732191 3.187871 3.709592 3.892558 4.811027 3.772555
## 5/2014 3.061529 3.732191 3.020089 3.380894 3.739157 6.815621 3.799310
## 6/2014 2.468975 3.068691 3.607328 2.892543 3.183077 3.608270 3.547807
MAKE A TIME SERIES FOR EACH CLUSTER:
plot(ts(cluster_mortality_total$Cluster.1),xlab = "2014-2019 (Months)",ylab = "Respiratory related deaths", main = "Monthly population weighted means for Cluster 1")
plot(ts(cluster_mortality_total$Cluster.2),xlab = "2014-2019 (Months)",ylab = "Respiratory related deaths", main = "Monthly population weighted means for Cluster 2")
plot(ts(cluster_mortality_total$Cluster.3),xlab = "2014-2019 (Months)",ylab = "Respiratory related deaths", main = "Monthly population weighted means for Cluster 3")
plot(ts(cluster_mortality_total$Cluster.4),xlab = "2014-2019 (Months)",ylab = "Respiratory related deaths", main = "Monthly population weighted means for Cluster 4")
plot(ts(cluster_mortality_total$Cluster.5),xlab = "2014-2019 (Months)",ylab = "Respiratory related deaths", main = "Monthly population weighted means for Cluster 5")
plot(ts(cluster_mortality_total$Cluster.6),xlab = "2014-2019 (Months)",ylab = "Respiratory related deaths", main = "Monthly population weighted means for Cluster 6")
plot(ts(cluster_mortality_total$Cluster.7),xlab = "2014-2019 (Months)",ylab = "Respiratory related deaths", main = "Monthly population weighted means for Cluster 7")
Now that all of the data from the SoA (used for SKATER and HUGE to get graph filter H), EPA (used to get gram matrix K), and Cal-ViDa (response) is downloaded, cleaned, and well formatted, we can now fit our kernel graph regression model as well as a few reference models, which we will compare against each other. For now, I have implemented a training-test data fitting approach to evaluating model performance.
First, I created an in sample dataset (inla_insample_data) which has variables ID (which represents the cluster label), ID2 (which is basically an index label), response (cluster mortality), time (time index label 1-66), and months (month label 1-12). Then, I create an out of sample dataset (inla_outsample_data) in which I decided to hold out the last 6 months of the data (67-72 or July-Dec 2019) so I replaced those response values with NAs. This is how you get INLA to make predictions/forecasts because it does so based on the posterior predictive distribution.
cluster_mortality_total_red = cluster_mortality_total[13:72,]
cluster_mortality_rate_red = cluster_mortality_rate[13:72,]
response = t(cluster_mortality_total_red)
response = as.vector(response)
response = ceiling(response)
response2 = t(cluster_mortality_rate_red)
response2 = as.vector(response2)
id = rep(c(1:7),60)
id2 = 1:(7*60)
time = rep(c(1:60),each = 7)
inla_full_data = data.frame(id,id2,response,time)
inla_full_data2 = data.frame(id,id2,response2,time)
months = rep(c(1:12),each = 7)
months = rep(months,5)
inla_full_data = cbind(inla_full_data,months)
#Experimented with defining each of these as factors
# inla_full_data$id = factor(inla_full_data$id)
# inla_full_data$id2 = factor(inla_full_data$id2)
# inla_full_data$time = factor(inla_full_data$time)
inla_full_data$months = factor(inla_full_data$months)
#Add multiple intercept columns, one for each cluster
Intercept1 = rep(c(1,NA,NA,NA,NA,NA,NA),60)
Intercept2 = rep(c(NA,1,NA,NA,NA,NA,NA),60)
Intercept3 = rep(c(NA,NA,1,NA,NA,NA,NA),60)
Intercept4 = rep(c(NA,NA,NA,1,NA,NA,NA),60)
Intercept5 = rep(c(NA,NA,NA,NA,1,NA,NA),60)
Intercept6 = rep(c(NA,NA,NA,NA,NA,1,NA),60)
Intercept7 = rep(c(NA,NA,NA,NA,NA,NA,1),60)
inla_full_data = cbind(inla_full_data,Intercept1,Intercept2,Intercept3,Intercept4,
Intercept5,Intercept6,Intercept7)
# response = replace_na(response,0.01)
inla_gp_data = inla_full_data2
year = rep(2015:2019,each = 12)
inla_gp_data = cbind(inla_gp_data,year)
###Split into in sample and out of sample dataset
inla_outsample_data = inla_full_data
#Omit values for months 61-66 (out of sample dataset)
omit_idx = which(inla_outsample_data$time > 54)
inla_outsample_data$response[omit_idx] = NA
inla_insample_data = inla_full_data[-omit_idx,]
omit_idx = which(inla_gp_data$time > 54)
inla_gp_data$response2[omit_idx] = NA
We wanted to compare the performance of our proposed model with a few reference models. The first one is a Poisson generalized linear mixed model. This model assumes the observed data follows a Poisson distribution and the hyperparameter \(\lambda_i\) can be modeled using a mixed effects model with a log link.
In other words,
\(Y_{c,t} \sim Pois(\Lambda_{c,t})\) for \(c=1,...,7\) and \(t=1,...,54\) where \(\Lambda_{c,t} = \exp(\beta_{c1} I \{ c=1 \} + \beta_{c2} I \{ c=2 \} + ... + \beta_{C} I \{ c=C \} + \beta_1 I \{t \, mod \, 12 = 1 \} + ... + \beta_{11} I \{t \, mod \, 12 = 11 \} + \textbf{F}_c)\)
where the random effect \(\textbf{F} | \tau \sim \mathcal{GP}(\textbf{0},\tau \boldsymbol{\Sigma})\)
We wanted the first reference model to be simple, so we assumed that the random effects \(\textbf{F}_c\) are iid. This means that \(\Sigma\) is simply a diagonal matrix of scaling factors. The hyperparameter \(log(\tau)\) is by default assigned a \(log \; \Gamma(1,0.00005)\) prior.
#Write a function to fit our poisson glmm in INLA
ref_model1 = function(dataset,a_prior = 1,b_prior = 5e-05,link=1){
###Fit INLA model
prec_prior <- list(prec = list(prior = "loggamma", param = c(a_prior,b_prior)))
ref_formula1 = response ~ -1 + months + Intercept1 + Intercept2 + Intercept3 + Intercept4 +
Intercept5 + Intercept6 + Intercept7 + f(id,model = "iid", hyper = prec_prior) #could use id or id2
model = inla(formula = ref_formula1,family = "poisson",data = dataset,
control.compute = list(dic=TRUE,waic=TRUE),
control.inla = list(strategy = "laplace"),
control.predictor = list(compute = TRUE, link = link))
###Extract relevant information and store in the list
model_summary <- model$summary.fixed
bri_hyperpar_summary <- bri.hyperpar.summary(model)
model_DIC <- model$dic$dic
model_WAIC <- model$waic$waic
preds_model <- model$summary.fitted.values
preds_model <- cbind(dataset$id, dataset$time, preds_model)
colnames(preds_model) <- c("id", "time", "mean", "sd", "0.025quant", "0.5quant", "0.975quant", "mode")
#Exponentiating parameter to get better interpretation of estimates
multeff <- exp(model$summary.fixed$mean)
names(multeff) <- model$names.fixed
#Plot of each parameters' posterior density
mf <- melt(model$marginals.fixed)
cf <- spread(mf,Var2,value)
names(cf)[2] <- 'parameter'
param_plot = ggplot(cf,aes(x=x,y=y)) + geom_line()+facet_wrap(~ parameter,
scales="free") + geom_vline(xintercept=0) + ylab("density")
#Plot of precision of random effect (main hyperparameter of interest)
sden <- data.frame(bri.hyper.sd(model$marginals.hyperpar[[1]]))
hyperparam_plot = ggplot(sden,aes(x,y)) + geom_line() + ylab("density") +
xlab("linear predictor")
#Store the results in the list
ref_model1_results = list(
model_summary = model_summary,
bri_hyperpar_summary = bri_hyperpar_summary,
exp_effects = multeff,
param_plot = param_plot,
hyperparam_plot = hyperparam_plot,
model_DIC = model_DIC,
model_WAIC = model_WAIC,
fitted_values = preds_model
)
return(ref_model1_results)
}
#Run model
ref_model1_fit = ref_model1(dataset = inla_insample_data,a_prior = 1,b_prior = 5e-5)
#Extract DIC and WAIC
ref_model1_DIC = ref_model1_fit$model_DIC
ref_model1_WAIC = ref_model1_fit$model_WAIC
#Get summaries of parameter estimates
ref_model1_fit$model_summary
## mean sd 0.025quant 0.5quant 0.975quant mode
## months1 2.1039998 7.254715 -12.123617 2.1039998 16.33162 2.1039998
## months2 1.8618772 7.254716 -12.365742 1.8618772 16.08950 1.8618772
## months3 1.8812181 7.254716 -12.346401 1.8812181 16.10884 1.8812181
## months4 1.6930264 7.254718 -12.534595 1.6930264 15.92065 1.6930264
## months5 1.6124055 7.254718 -12.615218 1.6124055 15.84003 1.6124055
## months6 1.4969693 7.254719 -12.730656 1.4969693 15.72459 1.4969693
## months7 1.4681954 7.254722 -12.759435 1.4681954 15.69583 1.4681954
## months8 1.4344602 7.254723 -12.793171 1.4344602 15.66209 1.4344602
## months9 1.4008382 7.254723 -12.826794 1.4008382 15.62847 1.4008382
## months10 1.4724084 7.254722 -12.755222 1.4724084 15.70004 1.4724084
## months11 1.5162769 7.254722 -12.711352 1.5162769 15.74391 1.5162769
## months12 1.7814817 7.254719 -12.446142 1.7814817 16.00911 1.7814817
## Intercept1 1.8619680 7.254756 -12.365728 1.8619680 16.08966 1.8619680
## Intercept2 2.1524555 7.254747 -12.075225 2.1524555 16.38014 2.1524555
## Intercept3 2.0751101 7.254749 -12.152574 2.0751101 16.30279 2.0751101
## Intercept4 4.2895287 7.254726 -9.938109 4.2895287 18.51717 4.2895287
## Intercept5 3.6523902 7.254728 -10.575252 3.6523902 17.88003 3.6523902
## Intercept6 0.6814357 7.254830 -13.546407 0.6814357 14.90928 0.6814357
## Intercept7 5.0102688 7.254724 -9.217366 5.0102688 19.23790 5.0102688
## kld
## months1 5.527842e-11
## months2 5.527826e-11
## months3 5.527840e-11
## months4 5.527838e-11
## months5 5.527851e-11
## months6 5.527836e-11
## months7 5.527831e-11
## months8 5.527831e-11
## months9 5.527843e-11
## months10 5.527845e-11
## months11 5.527832e-11
## months12 5.527836e-11
## Intercept1 5.527847e-11
## Intercept2 5.527834e-11
## Intercept3 5.527843e-11
## Intercept4 5.527822e-11
## Intercept5 5.527837e-11
## Intercept6 5.527842e-11
## Intercept7 5.527845e-11
ref_model1_fit$bri_hyperpar_summary
## mean sd q0.025 q0.5 q0.975 mode
## SD for id 0.01157621 0.009997557 0.003655985 0.008445019 0.04078183 0.005739866
ref_model1_fit$exp_effects
## months1 months2 months3 months4 months5 months6 months7
## 8.198898 6.435807 6.561492 5.435907 5.014860 4.468127 4.341394
## months8 months9 months10 months11 months12 Intercept1 Intercept2
## 4.197379 4.058601 4.359723 4.555234 5.938649 6.436391 8.605964
## Intercept3 Intercept4 Intercept5 Intercept6 Intercept7
## 7.965424 72.932088 38.566739 1.976714 149.945037
#Show plots
ref_model1_fit$param_plot
ref_model1_fit$hyperparam_plot
Note: SD for ID: standard deviation for the means (avg intensities) corresponding to the 8 different clusters was 0.116
Plot of posterior predictive estimates with credible interval bands OVERLAID on response:
#Write a function to make plot of posterior predictive estimates with credible interval bands OVERLAID on response
pp_insample_plot = function(num_plots = num_clus, ref_data = inla_insample_data, pred_data){
for (i in 1:num_plots){
df = ref_data %>% filter(id == i) %>% select(response)
preds = pred_data %>% filter(id == i)
df = cbind(df,preds)
# title = sprintf("Posterior Predictive Fits for Cluster %s",i)
title = sprintf("Cluster %s",i)
post_pred_plot = df %>% ggplot(aes(x=time,y=response)) + geom_point() +
geom_line(aes(y=mean),color = "red") + geom_ribbon(aes(ymin = `0.025quant`,ymax = `0.975quant`),alpha = 0.3) + ggtitle(title)
print(post_pred_plot)
}
}
#Plot ref_model1 pp plot
pp_insample_plot(pred_data = ref_model1_fit$fitted_values)
For our second reference model, we decided to fit a Besag-York-Mollie model, which is a log-normal Poisson model with an intrinsic conditional autoregressive component to capture spatial autocorrelations i.e. a Besag model, plus a standard random effects term which is included to capture non-spatial heterogeneity. Obviously, this model is less naive than reference model 1 because it does not assume iid random effects.
The BYM model can be written as,
\(Y_{c,t} \sim Pois(\Lambda_{c,t})\) for \(c=1,...,7\) and \(t=1,...,54\) where \(\Lambda_{c,t} | \textbf{S} = \exp(\beta_{c1} I \{ c=1 \} + \beta_{c2} I \{ c=2 \} + ... + \beta_{C} I \{ c=C \} + \beta_1 I \{t \, mod \, 12 = 1 \} + ... + \beta_{11} I \{t \, mod \, 12 = 11 \} + \phi_c + \textbf{F}_c)\)
where \(p(\boldsymbol{\phi}) \propto \exp(-\frac{1}{2} \sum_{c_1 \sim c_2} (\phi_{c_1} - \phi_{c_2})^2)\) and \(\textbf{F} | \; \textbf{S}, \tau \sim \mathcal{GP}(\textbf{0},\tau \boldsymbol{\Sigma})\).
Note: it is more commonly known that ICAR components are conditionally normally distributed.
As one can see below, the summary outputs indicate that this model is very similar to the Poisson GLMM (reference model 1). The intercept and SD for the random effect component are estimated to almost the exact same as those estimated by the Poisson GLMM, indicating that including the spatial ICAR component is seemingly not very impactful.
#Write a function to fit our BYM model in INLA
ref_model2 = function(dataset,a_prec_prior = 1,b_prec_prior = 5e-04,a_phi_prior = 1,b_phi_prior = 5e-04,link=1){
###Fit INLA model
bym_prior <- list(
prec.unstruct = list(
prior = "loggamma",
param = c(a_prec_prior,b_prec_prior)),
prec.spatial = list(
prior = "loggamma",
param = c(a_phi_prior,b_phi_prior))
)
ref_formula2 = response ~ -1 + months + Intercept1 + Intercept2 + Intercept3 + Intercept4 +
Intercept5 + Intercept6 + Intercept7 +
f(id, model = "bym", graph = huge.est, hyper = bym_prior) #ID2 in formula results in error
model = inla(formula = ref_formula2,family = "poisson",data = dataset,
control.compute = list(dic=TRUE,waic=TRUE),
control.inla = list(strategy = "laplace"),
control.predictor = list(compute = TRUE, link = link))
###Extract relevant information and store in the list
model_summary <- model$summary.fixed
bri_hyperpar_summary <- bri.hyperpar.summary(model)
model_DIC <- model$dic$dic
model_WAIC <- model$waic$waic
preds_model <- model$summary.fitted.values
preds_model <- cbind(dataset$id, dataset$time, preds_model)
colnames(preds_model) <- c("id", "time", "mean", "sd", "0.025quant", "0.5quant", "0.975quant", "mode")
#Exponentiating parameter to get better interpretation of estimates
multeff <- exp(model$summary.fixed$mean)
names(multeff) <- model$names.fixed
#Plot of each parameters' posterior density
mf <- melt(model$marginals.fixed)
cf <- spread(mf,Var2,value)
names(cf)[2] <- 'parameter'
param_plot = ggplot(cf,aes(x=x,y=y)) + geom_line()+facet_wrap(~ parameter,
scales="free") + geom_vline(xintercept=0) + ylab("density")
#Plot of precision of random effect (main hyperparameter of interest)
sden <- data.frame(bri.hyper.sd(model$marginals.hyperpar[[1]]))
hyperparam_plot = ggplot(sden,aes(x,y)) + geom_line() + ylab("density") +
xlab("linear predictor")
#Store the results in the list
ref_model2_results = list(
model_summary = model_summary,
bri_hyperpar_summary = bri_hyperpar_summary,
exp_effects = multeff,
param_plot = param_plot,
hyperparam_plot = hyperparam_plot,
model_DIC = model_DIC,
model_WAIC = model_WAIC,
fitted_values = preds_model
)
return(ref_model2_results)
}
#Fit ref_model2
ref_model2_fit = ref_model2(dataset = inla_insample_data,a_prec_prior = 1,b_prec_prior = 5e-4,
a_phi_prior = 1,b_phi_prior = 5e-4)
#Extract DIC and WAIC
ref_model2_DIC = ref_model2_fit$model_DIC
ref_model2_WAIC = ref_model2_fit$model_WAIC
#Get summaries of parameter estimates
ref_model2_fit$model_summary
## mean sd 0.025quant 0.5quant 0.975quant mode
## months1 2.1039920 7.254718 -12.123630 2.1039920 16.33161 2.1039920
## months2 1.8618694 7.254719 -12.365755 1.8618694 16.08949 1.8618694
## months3 1.8812103 7.254719 -12.346414 1.8812103 16.10883 1.8812103
## months4 1.6930185 7.254720 -12.534609 1.6930185 15.92065 1.6930185
## months5 1.6123976 7.254721 -12.615231 1.6123976 15.84003 1.6123976
## months6 1.4969614 7.254722 -12.730669 1.4969614 15.72459 1.4969614
## months7 1.4681875 7.254725 -12.759448 1.4681875 15.69582 1.4681875
## months8 1.4344523 7.254725 -12.793184 1.4344523 15.66209 1.4344523
## months9 1.4008304 7.254726 -12.826807 1.4008304 15.62847 1.4008304
## months10 1.4724006 7.254725 -12.755235 1.4724006 15.70004 1.4724006
## months11 1.5162690 7.254724 -12.711366 1.5162690 15.74390 1.5162690
## months12 1.7814739 7.254722 -12.446155 1.7814739 16.00910 1.7814739
## Intercept1 1.8619531 7.254905 -12.366036 1.8619531 16.08994 1.8619531
## Intercept2 2.1524453 7.254896 -12.075526 2.1524453 16.38042 2.1524453
## Intercept3 2.0750974 7.254904 -12.152890 2.0750974 16.30308 2.0750974
## Intercept4 4.2895309 7.254880 -9.938409 4.2895309 18.51747 4.2895309
## Intercept5 3.6523966 7.254892 -10.575567 3.6523966 17.88036 3.6523966
## Intercept6 0.6813729 7.254985 -13.546773 0.6813729 14.90952 0.6813729
## Intercept7 5.0102666 7.254873 -9.217660 5.0102666 19.23819 5.0102666
## kld
## months1 5.527839e-11
## months2 5.527836e-11
## months3 5.527835e-11
## months4 5.527834e-11
## months5 5.527832e-11
## months6 5.527831e-11
## months7 5.527827e-11
## months8 5.527840e-11
## months9 5.527853e-11
## months10 5.527840e-11
## months11 5.527854e-11
## months12 5.527846e-11
## Intercept1 5.527755e-11
## Intercept2 5.527768e-11
## Intercept3 5.527757e-11
## Intercept4 5.527766e-11
## Intercept5 5.527746e-11
## Intercept6 5.527755e-11
## Intercept7 5.527750e-11
ref_model2_fit$bri_hyperpar_summary
## mean sd q0.025 q0.5
## SD for id (idd component) 0.03167266 0.01821742 0.01187799 0.02658754
## SD for id (spatial component) 0.03266254 0.01951004 0.01202534 0.02709359
## q0.975 mode
## SD for id (idd component) 0.08052979 0.01976978
## SD for id (spatial component) 0.08523203 0.01979828
ref_model2_fit$exp_effects
## months1 months2 months3 months4 months5 months6 months7
## 8.198834 6.435756 6.561441 5.435864 5.014820 4.468092 4.341359
## months8 months9 months10 months11 months12 Intercept1 Intercept2
## 4.197346 4.058569 4.359688 4.555198 5.938603 6.436295 8.605877
## Intercept3 Intercept4 Intercept5 Intercept6 Intercept7
## 7.965323 72.932251 38.566985 1.976590 149.944706
#Show plots
ref_model2_fit$param_plot
ref_model2_fit$hyperparam_plot
pp_insample_plot(pred_data = ref_model2_fit$fitted_values)
Finally, we fit our proposed model which we call a kernel graph regression model. It also takes the form of a latent Gaussian model as shown below:
\(\Lambda_{c,t} | \textbf{F}, \textbf{S} = \exp(\beta_{c1} I \{ c=1 \} + \beta_{c2} I \{ c=2 \} + ... + \beta_{7} I \{ c=7 \} + \beta_1 I \{t \, mod \, 12 = 1 \} + ... + \beta_{11} I \{t \, mod \, 12 = 11 \} + \textbf{F}_{c,t})\)
where the graph signal \(\textbf{F} | \textbf{S}, \rho_{rbf}, \rho_{p}, \sigma^2_{EPA} \sim \mathcal{GP}(\textbf{0},\textbf{K}_{EPA} \otimes \textbf{H}^2)\) with \(Cov(F_{c_1,t_1},F_{c_2,t_2}) = \left[ \textbf{K}^{EPA} \right]_{t_1,t_2} \left[ \textbf{H}^2 \right]_{c_1,c_2}\).
The key difference here is that the covariance matrix of this GP is specified by the kronecker product of \(K\), which is the time kernel gram matrix calculated from the EPA air quality data, and \(H\), which is the graph filter which is calculated from the adjacency matrix estimated by glasso using the HUGE package. This matrix has been estimated already so it can be directly plugged into INLA as the covariance matrix of our underlying GP using the “generic0” specification as shown below:
Using the EPA air quality data, we can calculate the gram matrix K which will characterize the dependence structure of air quality (across 7 different pollutants and AQI) over time. This is done by calculating the squared difference between all of the observations at two different time points e.g. 56 observations for Jan 2014 compared with the 64 observations for Feb 2014. For the in sample analysis, the resulting matrix is 54x54 because we are holding out the last 6 months of observations.
EPA_kernel = function(EPA_data = final_EPA_agg_data,time_span,rho_rbf,rho_periodic,sigma2){
K_EPA = matrix(0,nrow=time_span,ncol=time_span)
i = 1
j = 1
for(t1 in 1:time_span){
for (t2 in 1:time_span){
A = EPA_data %>% filter(Time == t1)
B = EPA_data %>% filter(Time == t2)
AQIa = unique(A$AQI)
AQIb = unique(B$AQI)
ABtest = c((A$Value-B$Value)^2,(AQIa-AQIb)^2) #7 clusters * 8 measurements
# K_EPA[i,j] = exp(-sum(ABtest) / (2*rho_rbf)) * sigma2
# K_EPA[i,j] = exp(- (sum(ABtest)) ###square this sum or remove it???
# / (2*rho_rbf)) * exp(- (2*sin(sum(abs(ABtest))*pi/12)^2)
# / (rho_periodic)) * sigma2
K_EPA[i,j] = exp(- (mean(ABtest)) ###mean or sum???
/ (2*rho_rbf)) * exp(- (2*sin(sum(abs(ABtest))*pi/12)^2)
/ (rho_periodic)) * sigma2
j = j+1
}
j = 1
i = i+1
}
return(K_EPA)
}
desingularize = function(covmatrix,threshold = 1e-2,increment = 0.01){
tracker = 0
while (rcond(covmatrix) <= threshold){
#Perform spectral decomposition
ev = eigen(covmatrix)
L = ev$values
V = ev$vectors
# #Add a little noise to eigenvalues to bring away from 0
L = L + increment
# #Add 0.01 to eigenvalues in bottom five percentile to bring away from 0
# cutoff = quantile(abs(L),0.05)
# L[which(abs(L) < cutoff)] = L[which(abs(L) < cutoff)] + 0.01
#Calculate new precision matrix
covmatrix = V %*% diag(L) %*% t(V)
tracker = tracker + 1
}
results_list = list(covmatrix,tracker)
#sprintf("%s iterations of desingularizer applied",tracker)
return(results_list)
}
# test = desingularize(K_time)
GLMM with type 0 generic specification (known covariance matrix)
kgr_model2 = function(dataset, rho_EPA_rbf = 1, rho_EPA_periodic = 1, sigma2_EPA = 1,link=1){
#Calculate gram matrix K_EPA
K_EPA = EPA_kernel(time_span = length(unique(dataset$time)),
rho_rbf = rho_EPA_rbf, rho_periodic = rho_EPA_periodic, sigma2 = sigma2_EPA)
#Heatmap of resulting K
K_EPA_heatmap = matrix_heatmap(K_EPA,title = "K_EPA Heatmap")
#Calculate trace norm of gram matrix
K_EPA_weight = norm((1/60)*K_EPA,type = "F")
###Load graph regression kernel
covGP = kronecker(K_EPA/60,(H^2)/7)
#Need to ensure precision matrix is not computationally singular i.e det > 0
covGP_jittered = desingularize(covGP,threshold = 1e-2,increment = 0.01)
covGP = covGP_jittered[[1]]
inv_covGP = solve(covGP)
# cov_Fnorm = norm(covGP,type = "F")
#Heatmap of resulting K
inv_covGP_heatmap = matrix_heatmap(inv_covGP,title = "")
###Fit INLA model
# kgr_formula2 = response ~ -1 + Intercept1 + Intercept2 + Intercept3 + Intercept4 +
# Intercept5 + Intercept6 + Intercept7 + f(id2,model = "generic0",Cmatrix = inv_covGP)
kgr_formula2 = response ~ -1 + months + Intercept1 + Intercept2 + Intercept3 + Intercept4 +
Intercept5 + Intercept6 + Intercept7 + f(id2,model = "generic0",Cmatrix = inv_covGP)
model = inla(formula = kgr_formula2,family = "poisson",data = dataset,
control.compute = list(dic=TRUE,waic=TRUE,
return.marginals.predictor=TRUE),
control.inla = list(strategy = "laplace"),
control.predictor = list(compute = TRUE, link = link))
###Extract relevant information and store in the list
model_summary <- model$summary.fixed
bri_hyperpar_summary <- bri.hyperpar.summary(model)
model_DIC <- model$dic$dic
model_WAIC <- model$waic$waic
preds_model <- model$summary.fitted.values
preds_model <- cbind(dataset$id, dataset$time, preds_model)
colnames(preds_model) <- c("id", "time", "mean", "sd", "0.025quant", "0.5quant", "0.975quant", "mode")
marginal_fvs <- model$marginals.fitted.values
#Exponentiating parameter to get better interpretation of estimates
multeff <- exp(model$summary.fixed$mean)
names(multeff) <- model$names.fixed
#Plot of each parameters' posterior density
mf <- melt(model$marginals.fixed)
cf <- spread(mf,Var2,value)
names(cf)[2] <- 'parameter'
param_plot = ggplot(cf,aes(x=x,y=y)) + geom_line()+facet_wrap(~ parameter,
scales="free") + geom_vline(xintercept=0) + ylab("density")
#Plot of precision of random effect (main hyperparameter of interest)
sden <- data.frame(bri.hyper.sd(model$marginals.hyperpar[[1]]))
hyperparam_plot = ggplot(sden,aes(x,y)) + geom_line() + ylab("density") +
xlab("linear predictor")
#Store the results in the list
kgr_model2_results = list(
K_EPA_heatmap = K_EPA_heatmap,
K_EPA_weight = K_EPA_weight/(K_EPA_weight + gfilter_weight),
gfilter_weight = gfilter_weight/(K_EPA_weight + gfilter_weight),
covmatrix = covGP,
prec = inv_covGP,
num_jitters = covGP_jittered[[2]],
prec_heatmap = inv_covGP_heatmap,
model_summary = model_summary,
bri_hyperpar_summary = bri_hyperpar_summary,
exp_effects = multeff,
param_plot = param_plot,
hyperparam_plot = hyperparam_plot,
model_DIC = model_DIC,
model_WAIC = model_WAIC,
fitted_values = preds_model,
marg_fitted_values = marginal_fvs
)
return(kgr_model2_results)
}
#Fit kgr_model2
kgr_model2_fit = kgr_model2(dataset = inla_insample_data,rho_EPA_rbf = 23.009,rho_EPA_periodic = 5253.017, sigma2_EPA = 0.928)
#Extract DIC and WAIC
kgr_model2_DIC = kgr_model2_fit$model_DIC
kgr_model2_WAIC = kgr_model2_fit$model_WAIC
#Get summaries of parameter estimates
kgr_model2_fit$model_summary
## mean sd 0.025quant 0.5quant 0.975quant mode
## months1 2.0988687 7.254730 -12.128777 2.0988687 16.32651 2.0988687
## months2 1.8695051 7.254731 -12.358143 1.8695051 16.09715 1.8695051
## months3 1.8910636 7.254731 -12.336585 1.8910636 16.11871 1.8910636
## months4 1.6964637 7.254733 -12.531188 1.6964637 15.92412 1.6964637
## months5 1.6160841 7.254734 -12.611570 1.6160841 15.84374 1.6160841
## months6 1.4917409 7.254735 -12.735916 1.4917409 15.71940 1.4917409
## months7 1.4552223 7.254742 -12.772448 1.4552223 15.68289 1.4552223
## months8 1.4342081 7.254742 -12.793462 1.4342081 15.66188 1.4342081
## months9 1.3928950 7.254743 -12.834777 1.3928950 15.62057 1.3928950
## months10 1.4733904 7.254742 -12.754279 1.4733904 15.70106 1.4733904
## months11 1.5164172 7.254741 -12.711251 1.5164172 15.74409 1.5164172
## months12 1.7762987 7.254738 -12.451362 1.7762987 16.00396 1.7762987
## Intercept1 1.8604593 7.254759 -12.367243 1.8604593 16.08816 1.8604593
## Intercept2 2.1517806 7.254758 -12.075920 2.1517806 16.37948 2.1517806
## Intercept3 2.0744176 7.254779 -12.153325 2.0744176 16.30216 2.0744176
## Intercept4 4.2859271 7.254753 -9.941764 4.2859271 18.51362 4.2859271
## Intercept5 3.6474903 7.254767 -10.580227 3.6474903 17.87521 3.6474903
## Intercept6 0.6819245 7.254874 -13.546003 0.6819245 14.90985 0.6819245
## Intercept7 5.0101584 7.254727 -9.217481 5.0101584 19.23780 5.0101584
## kld
## months1 5.527847e-11
## months2 5.527845e-11
## months3 5.527844e-11
## months4 5.527841e-11
## months5 5.527840e-11
## months6 5.527824e-11
## months7 5.527828e-11
## months8 5.527841e-11
## months9 5.527840e-11
## months10 5.527828e-11
## months11 5.527829e-11
## months12 5.527834e-11
## Intercept1 5.527843e-11
## Intercept2 5.527845e-11
## Intercept3 5.527840e-11
## Intercept4 5.527838e-11
## Intercept5 5.527831e-11
## Intercept6 5.527830e-11
## Intercept7 5.527821e-11
kgr_model2_fit$bri_hyperpar_summary
## mean sd q0.025 q0.5 q0.975 mode
## SD for id2 0.651928 0.05113911 0.5562518 0.6501578 0.7571101 0.6467985
kgr_model2_fit$exp_effects
## months1 months2 months3 months4 months5 months6 months7
## 8.156937 6.485086 6.626413 5.454624 5.033342 4.444827 4.285436
## months8 months9 months10 months11 months12 Intercept1 Intercept2
## 4.196321 4.026490 4.364006 4.555873 5.907949 6.426688 8.600158
## Intercept3 Intercept4 Intercept5 Intercept6 Intercept7
## 7.959909 72.669888 38.378226 1.977680 149.928485
kgr_model2_fit$K_EPA_weight
## [1] 0.7751928
kgr_model2_fit$gfilter_weight
## [1] 0.2248072
kgr_model2_fit$num_jitters
## [1] 1
#Show plots
kgr_model2_fit$prec_heatmap
kgr_model2_fit$K_EPA_heatmap
kgr_model2_fit$param_plot
kgr_model2_fit$hyperparam_plot
pp_insample_plot(pred_data = kgr_model2_fit$fitted_values)
We can also simplify the covariance of our underlying GP and see how our proposed model compares with a simplified version with a simple time kernel:
\(\Lambda_{c,t} | \textbf{F} = \exp(\beta_{c1} I \{ c=1 \} + \beta_{c2} I \{ c=2 \} + ... + \beta_{7} I \{ c=7 \} + \textbf{F}_{t})\)
where the graph signal \(\textbf{F} | \rho_{rbf}, \rho_{p}, \sigma^2_{time} \sim \mathcal{GP}(\textbf{0},\textbf{K}_{time})\) with \(Cov(F_{c_1,t_1},F_{c_2,t_2}) = \left[ \textbf{K}^{time} \right]_{t_1,t_2}\).
Instead of calculating gram matrix K based on covariate (EPA variables) similarity, our gram matrix K is simply a time kernel where similar values of t (months 1-60) have larger covariances. As a result, this model has no spatial dependence structure built in.
time_kernel = function(time_span,rho_rbf,rho_periodic,sigma2){
K_time = matrix(NA,nrow = time_span, ncol = time_span)
for (i in 1:time_span){
for (j in 1:time_span){
# K_time[i,j] = exp(- (abs(i-j)^2) / (2*rho)) * sigma2
K_time[i,j] = exp(- (abs(i-j)^2) / (2*rho_rbf)) * exp(- (2*sin(sum(abs(i-j))*pi/12)^2)
/ (rho_periodic)) * sigma2
}
}
return(K_time)
}
Since there is no spatial component in this model, each cluster can be fit separately.
ref_model3 = function(dataset, cluster, rho_time_rbf = 1, rho_time_periodic = 1, sigma2_time = 1,link=1){
#Calculating gram matrix K_time
K_time = time_kernel(time_span = length(unique(dataset$time)),rho_rbf = rho_time_rbf,
rho_periodic = rho_time_periodic, sigma2 = sigma2_time)
#Heatmap of resulting K
K_time_heatmap = matrix_heatmap(K_time,title = "K_time heatmap")
#Calculate trace norm of gram matrix
K_time_weight = norm(K_time,type = "F")
#Need to ensure precision matrix is not computationally singular i.e det > 0
covGP_jittered = desingularize(K_time,threshold = 1e-2,increment = 0.01)
K_time = covGP_jittered[[1]]
inv_K_time = solve(K_time)
# cov_Fnorm = norm(K_time,type = "F")
#Heatmap of resulting inv_K_time
inv_K_time_heatmap = matrix_heatmap(inv_K_time,title = "")
###Fitting the model on each cluster
inla_test_clus_data = dataset %>% filter(id == cluster)
ref_formula3 = response ~ -1 + Intercept1 + Intercept2 + Intercept3 + Intercept4 +
Intercept5 + Intercept6 + Intercept7 + f(time,model = "generic0",Cmatrix = inv_K_time)
model = inla(ref_formula3, data = inla_test_clus_data, family = "poisson",
control.compute = list(dic=TRUE,waic=TRUE),
control.inla = list(strategy = "laplace"),
control.predictor = list(compute = TRUE, link = link))
###Extract relevant information and store in the list
model_summary <- model$summary.fixed
bri_hyperpar_summary <- bri.hyperpar.summary(model)
model_DIC <- model$dic$dic
model_WAIC <- model$waic$waic
preds_model <- model$summary.fitted.values
preds_model <- cbind(inla_test_clus_data$id, inla_test_clus_data$time, preds_model)
colnames(preds_model) <- c("id", "time", "mean", "sd", "0.025quant", "0.5quant", "0.975quant", "mode")
#Exponentiating parameter to get better interpretation of estimates
multeff <- exp(model$summary.fixed$mean)
names(multeff) <- model$names.fixed
#Plot of each parameters' posterior density
mf <- melt(model$marginals.fixed)
cf <- spread(mf,Var2,value)
names(cf)[2] <- 'parameter'
param_plot = ggplot(cf,aes(x=x,y=y)) + geom_line()+facet_wrap(~ parameter,
scales="free") + geom_vline(xintercept=0) + ylab("density")
#Plot of precision of random effect (main hyperparameter of interest)
sden <- data.frame(bri.hyper.sd(model$marginals.hyperpar[[1]]))
hyperparam_plot = ggplot(sden,aes(x,y)) + geom_line() + ylab("density") +
xlab("linear predictor")
#Store the results in the list
ref_model3_results = list(
K_time_heatmap = K_time_heatmap,
K_time_weight = K_time_weight/(K_time_weight + gfilter_weight),
gfilter_weight = gfilter_weight/(K_time_weight + gfilter_weight),
covmatrix = K_time,
prec = inv_K_time,
num_jitters = covGP_jittered[[2]],
prec_heatmap = inv_K_time_heatmap,
model_summary = model_summary,
bri_hyperpar_summary = bri_hyperpar_summary,
exp_effects = multeff,
param_plot = param_plot,
hyperparam_plot = hyperparam_plot,
model_DIC = model_DIC,
model_WAIC = model_WAIC,
fitted_values = preds_model
)
return(ref_model3_results)
}
#Fit ref_model3 on one cluster (to test)
ref_model3_fit = ref_model3(dataset = inla_insample_data, cluster = 2, rho_time_rbf = 1,
rho_time_periodic = 1, sigma2_time = 5)
#Extract DIC and WAIC
ref_model3_DIC = ref_model3_fit$model_DIC
ref_model3_WAIC = ref_model3_fit$model_WAIC
#Get summaries of parameter estimates
ref_model3_fit$model_summary
## mean sd 0.025quant 0.5quant 0.975quant
## Intercept1 5.339512e-15 31.62254149 -62.016688 -4.615441e-15 62.016688
## Intercept2 3.814799e+00 0.04576285 3.723701 3.815110e+00 3.904159
## Intercept3 5.339512e-15 31.62254149 -62.016688 -4.615441e-15 62.016688
## Intercept4 5.339512e-15 31.62254149 -62.016688 -4.615441e-15 62.016688
## Intercept5 5.339512e-15 31.62254149 -62.016688 -4.615441e-15 62.016688
## Intercept6 5.339512e-15 31.62254149 -62.016688 -4.615441e-15 62.016688
## Intercept7 5.339512e-15 31.62254149 -62.016688 -4.615441e-15 62.016688
## mode kld
## Intercept1 0.000000 5.527836e-11
## Intercept2 3.815725 1.152827e-08
## Intercept3 0.000000 5.527836e-11
## Intercept4 0.000000 5.527836e-11
## Intercept5 0.000000 5.527836e-11
## Intercept6 0.000000 5.527836e-11
## Intercept7 0.000000 5.527836e-11
ref_model3_fit$bri_hyperpar_summary
## mean sd q0.025 q0.5 q0.975 mode
## SD for time 0.08981029 0.01317363 0.06656563 0.08886275 0.1183234 0.08713069
ref_model3_fit$exp_effects
## Intercept1 Intercept2 Intercept3 Intercept4 Intercept5 Intercept6 Intercept7
## 1.00000 45.36764 1.00000 1.00000 1.00000 1.00000 1.00000
ref_model3_fit$K_time_weight
## [1] 0.9949444
ref_model3_fit$gfilter_weight
## [1] 0.005055571
ref_model3_fit$num_jitters
## [1] 0
#Show plots
ref_model3_fit$prec_heatmap
ref_model3_fit$K_time_heatmap
ref_model3_fit$param_plot
ref_model3_fit$hyperparam_plot
test1 = ref_model3(dataset = inla_insample_data, cluster = 1, rho_time_rbf = 105.474,
rho_time_periodic = 1.902, sigma2_time = 1.802)
test2 = ref_model3(dataset = inla_insample_data, cluster = 2, rho_time_rbf = 498.918,
rho_time_periodic = 120.307, sigma2_time = 0.014)
test3 = ref_model3(dataset = inla_insample_data, cluster = 3, rho_time_rbf = 8.090,
rho_time_periodic = 468.170, sigma2_time = 2.428)
test4 = ref_model3(dataset = inla_insample_data, cluster = 4, rho_time_rbf = 105.474,
rho_time_periodic = 1.902, sigma2_time = 1.802) #why does sigma2 have to be so big here??
test5 = ref_model3(dataset = inla_insample_data, cluster = 5, rho_time_rbf = 105.474,
rho_time_periodic = 1.902, sigma2_time = 1.802)
test6 = ref_model3(dataset = inla_insample_data, cluster = 6, rho_time_rbf = 105.474,
rho_time_periodic = 1.902, sigma2_time = 1.802) #why does sigma2 have to be so big here??
test7 = ref_model3(dataset = inla_insample_data, cluster = 7, rho_time_rbf = 474.985,
rho_time_periodic = 2.050, sigma2_time = 1.489)
ref_model3_fvs = rbind(test1$fitted_values,test2$fitted_values,test3$fitted_values,
test4$fitted_values,test5$fitted_values,test6$fitted_values,
test7$fitted_values)
pp_insample_plot(num_plots = num_clus,ref_data = inla_insample_data,pred_data = ref_model3_fvs)
\(\Lambda_{c,t} | \textbf{F}, \textbf{S} = \exp(\beta_{c1} I \{ c=1 \} + \beta_{c2} I \{ c=2 \} + ... + \beta_{7} I \{ c=7 \} + \beta_1 I \{t \, mod \, 12 = 1 \} + ... + \beta_{11} I \{t \, mod \, 12 = 11 \} + \textbf{F}_{c,t})\)
where the graph signal \(\textbf{F} | \textbf{S}, \rho_{rbf}, \rho_{p}, \sigma^2_{time} \sim \mathcal{GP}(\textbf{0},\textbf{K}_{time} \otimes \textbf{H}^2)\) with \(Cov(F_{c_1,t_1},F_{c_2,t_2}) = \left[ \textbf{K}^{time} \right]_{t_1,t_2} \left[ \textbf{H}^2 \right]_{c_1,c_2}\).
kgr_model1 = function(dataset, rho_time_rbf = 1, rho_time_periodic = 1, sigma2_time = 1, link=1){
#Calculating gram matrix K_time
K_time = time_kernel(time_span = length(unique(dataset$time)),rho_rbf = rho_time_rbf,
rho_periodic = rho_time_periodic, sigma2 = sigma2_time)
#Heatmap of resulting K
K_time_heatmap = matrix_heatmap(K_time,title = "K_time heatmap")
#Calculate trace norm of gram matrix
K_time_weight = norm((1/60)*K_time,type = "F")
#Calculate proposed kernel
covGP2 = kronecker(K_time/60,(H^2)/7)
#Need to ensure precision matrix is not computationally singular i.e det > 0
covGP_jittered = desingularize(covGP2,threshold = 1e-2,increment = 0.01)
covGP2 = covGP_jittered[[1]]
inv_covGP2 = solve(covGP2)
#Heatmap of resulting inv_covGP2
inv_covGP2_heatmap = matrix_heatmap(inv_covGP2,title = "")
###Fit INLA model
# kgr_formula1 = response ~ -1 + Intercept1 + Intercept2 + Intercept3 + Intercept4 +
# Intercept5 + Intercept6 + Intercept7 + f(id2,model = "generic0",Cmatrix = inv_covGP2)
kgr_formula1 = response ~ -1 + months + Intercept1 + Intercept2 + Intercept3 + Intercept4 +
Intercept5 + Intercept6 + Intercept7 + f(id2,model = "generic0",Cmatrix = inv_covGP2)
model = inla(formula = kgr_formula1,family = "poisson",data = dataset,
control.compute = list(dic=TRUE,waic=TRUE,
return.marginals.predictor=TRUE),
control.inla = list(strategy = "laplace"),
control.predictor = list(compute = TRUE, link = link))
###Extract relevant information and store in the list
model_summary <- model$summary.fixed
bri_hyperpar_summary <- bri.hyperpar.summary(model)
model_DIC <- model$dic$dic
model_WAIC <- model$waic$waic
preds_model <- model$summary.fitted.values
preds_model <- cbind(dataset$id, dataset$time, preds_model)
colnames(preds_model) <- c("id", "time", "mean", "sd", "0.025quant", "0.5quant", "0.975quant", "mode")
marginal_fvs <- model$marginals.fitted.values
#Exponentiating parameter to get better interpretation of estimates
multeff <- exp(model$summary.fixed$mean)
names(multeff) <- model$names.fixed
#Plot of each parameters' posterior density
mf <- melt(model$marginals.fixed)
cf <- spread(mf,Var2,value)
names(cf)[2] <- 'parameter'
param_plot = ggplot(cf,aes(x=x,y=y)) + geom_line()+facet_wrap(~ parameter,
scales="free") + geom_vline(xintercept=0) + ylab("density")
#Plot of precision of random effect (main hyperparameter of interest)
sden <- data.frame(bri.hyper.sd(model$marginals.hyperpar[[1]]))
hyperparam_plot = ggplot(sden,aes(x,y)) + geom_line() + ylab("density") +
xlab("linear predictor")
#Store the results in the list
kgr_model1_results = list(
K_time_heatmap = K_time_heatmap,
K_time_weight = K_time_weight/(K_time_weight + gfilter_weight),
gfilter_weight = gfilter_weight/(K_time_weight + gfilter_weight),
covmatrix = covGP2,
prec = inv_covGP2,
num_jitters = covGP_jittered[[2]],
prec_heatmap = inv_covGP2_heatmap,
model_summary = model_summary,
bri_hyperpar_summary = bri_hyperpar_summary,
exp_effects = multeff,
param_plot = param_plot,
hyperparam_plot = hyperparam_plot,
model_DIC = model_DIC,
model_WAIC = model_WAIC,
fitted_values = preds_model,
marg_fitted_values = marginal_fvs
)
return(kgr_model1_results)
}
#Fit kgr_model1
kgr_model1_fit = kgr_model1(dataset = inla_insample_data,rho_time_rbf = 63.888,rho_time_periodic = 5.845,sigma2_time = 0.720)
#Extract DIC and WAIC
kgr_model1_DIC = kgr_model1_fit$model_DIC
kgr_model1_WAIC = kgr_model1_fit$model_WAIC
#Get summaries of parameter estimates
kgr_model1_fit$model_summary
## mean sd 0.025quant 0.5quant 0.975quant mode
## months1 2.0980138 7.254728 -12.129629 2.0980138 16.32566 2.0980138
## months2 1.8695839 7.254730 -12.358062 1.8695839 16.09723 1.8695839
## months3 1.8914904 7.254730 -12.336156 1.8914904 16.11914 1.8914904
## months4 1.6970299 7.254732 -12.530620 1.6970299 15.92468 1.6970299
## months5 1.6170623 7.254733 -12.610589 1.6170623 15.84471 1.6170623
## months6 1.4933315 7.254734 -12.734323 1.4933315 15.72099 1.4933315
## months7 1.4550675 7.254741 -12.772600 1.4550675 15.68273 1.4550675
## months8 1.4333935 7.254741 -12.794274 1.4333935 15.66106 1.4333935
## months9 1.3922574 7.254742 -12.835412 1.3922574 15.61993 1.3922574
## months10 1.4728782 7.254740 -12.754788 1.4728782 15.70054 1.4728782
## months11 1.5157025 7.254740 -12.711963 1.5157025 15.74337 1.5157025
## months12 1.7749705 7.254736 -12.452688 1.7749705 16.00263 1.7749705
## Intercept1 1.8604875 7.254750 -12.367197 1.8604875 16.08817 1.8604875
## Intercept2 2.1517188 7.254743 -12.075952 2.1517188 16.37939 2.1517188
## Intercept3 2.0736398 7.254749 -12.154043 2.0736398 16.30132 2.0736398
## Intercept4 4.2849911 7.254724 -9.942644 4.2849911 18.51263 4.2849911
## Intercept5 3.6474254 7.254729 -10.580219 3.6474254 17.87507 3.6474254
## Intercept6 0.6824637 7.254833 -13.545383 0.6824637 14.91031 0.6824637
## Intercept7 5.0100552 7.254718 -9.217567 5.0100552 19.23768 5.0100552
## kld
## months1 5.527848e-11
## months2 5.527833e-11
## months3 5.527832e-11
## months4 5.527843e-11
## months5 5.527842e-11
## months6 5.527853e-11
## months7 5.527843e-11
## months8 5.527829e-11
## months9 5.527842e-11
## months10 5.527844e-11
## months11 5.527845e-11
## months12 5.527836e-11
## Intercept1 5.527829e-11
## Intercept2 5.527854e-11
## Intercept3 5.527832e-11
## Intercept4 5.527841e-11
## Intercept5 5.527834e-11
## Intercept6 5.527852e-11
## Intercept7 5.527824e-11
kgr_model1_fit$bri_hyperpar_summary
## mean sd q0.025 q0.5 q0.975 mode
## SD for id2 0.6336751 0.04940331 0.5412722 0.6319559 0.735312 0.6286969
kgr_model1_fit$exp_effects
## months1 months2 months3 months4 months5 months6 months7
## 8.149967 6.485597 6.629241 5.457713 5.038268 4.451902 4.284773
## months8 months9 months10 months11 months12 Intercept1 Intercept2
## 4.192904 4.023923 4.361771 4.552618 5.900107 6.426869 8.599627
## Intercept3 Intercept4 Intercept5 Intercept6 Intercept7
## 7.953721 72.601897 38.375735 1.978747 149.913011
kgr_model1_fit$K_time_weight
## [1] 0.5390401
kgr_model1_fit$gfilter_weight
## [1] 0.4609599
kgr_model1_fit$num_jitters
## [1] 1
#Show plots
kgr_model1_fit$K_time_heatmap
kgr_model1_fit$prec_heatmap
kgr_model1_fit$param_plot
kgr_model1_fit$hyperparam_plot
pp_insample_plot(pred_data = kgr_model1_fit$fitted_values)
Finally, we could also increase the complexity of our proposed model by including our time kernel in the covariance structure of the underlying GP. Notice that K does not explicitly have a temporal dependence structure; instead, it represents EPA covariate similarity compared across months. We can explicitly include the time kernel above by either element wise adding or multiplying K_EPA and K_time together before taking the kronecker product with \(H^2\)
\(\Lambda_{c,t} | \textbf{F}, \textbf{S} = \exp(\beta_{c1} I \{ c=1 \} + \beta_{c2} I \{ c=2 \} + ... + \beta_{7} I \{ c=7 \} + \beta_1 I \{t \, mod \, 12 = 1 \} + ... + \beta_{11} I \{t \, mod \, 12 = 11 \} + \textbf{F}_{c,t})\)
where the graph signal \(\textbf{F} | \textbf{S}, \rho^{time}_{rbf}, \rho^{time}_{p}, \rho^{EPA}_{rbf}, \rho^{EPA}_{p}, \sigma^2 \sim \mathcal{GP}(\textbf{0},(\textbf{K}_{time} \odot \textbf{K}_{EPA}) \otimes \textbf{H}^2)\) with \(Cov(F_{c_1,t_1},F_{c_2,t_2}) = \left[ \textbf{K}^{time} \right]_{t_1,t_2} \left[ \textbf{K}^{EPA} \right]_{t_1,t_2} \left[ \textbf{H}^2 \right]_{c_1,c_2}\).
kgr_model3 = function(dataset,rho_EPA_rbf = 1,rho_EPA_periodic = 1,rho_time_rbf = 1,rho_time_periodic = 1,sigma2 = 1,link=1){
###Calculating gram matrix K_EPA
K_EPA = EPA_kernel(time_span = length(unique(dataset$time)),
rho_rbf = rho_EPA_rbf,rho_periodic = rho_EPA_periodic,sigma2 = 1)
#Heatmap of resulting K
K_EPA_heatmap = matrix_heatmap(K_EPA,title = "K_EPA heatmap")
###Calculating gram matrix K_time
K_time = time_kernel(time_span = length(unique(dataset$time)),rho_rbf = rho_time_rbf,
rho_periodic = rho_time_periodic, sigma2 = sigma2)
#Heatmap of resulting K
K_time_heatmap = matrix_heatmap(K_time,title = "K_time heatmap")
K_EPA_norm = norm(K_EPA,type = "F")
K_time_norm = norm(K_time,type = "F")
#Calculate trace norm of gram matrix
gram = (K_EPA*K_time)/sigma2
K_weight = norm((1/60)*gram,type = "F")
###Load graph regression kernel
# covGP3 = kronecker(gram,H^2)
covGP3 = kronecker(gram/60,(H^2)/7)
#Need to ensure precision matrix is not computationally singular i.e det > 0
covGP_jittered = desingularize(covGP3,threshold = 1e-2,increment = 0.01)
covGP3 = covGP_jittered[[1]]
inv_covGP3 = solve(covGP3)
#Heatmap of resulting K
inv_covGP3_heatmap = matrix_heatmap(inv_covGP3,title = "")
###Fit INLA model
# kgr_formula3 = response ~ -1 + Intercept1 + Intercept2 + Intercept3 + Intercept4 +
# Intercept5 + Intercept6 + Intercept7 + f(id2,model = "generic0",Cmatrix = inv_covGP3)
kgr_formula3 = response ~ -1 + months + Intercept1 + Intercept2 + Intercept3 + Intercept4 +
Intercept5 + Intercept6 + Intercept7 + f(id2,model = "generic0",Cmatrix = inv_covGP3)
model = inla(formula = kgr_formula3,family = "poisson",data = dataset,
control.compute = list(dic=TRUE,waic=TRUE,
return.marginals.predictor=TRUE),
control.inla = list(strategy = "gaussian"),
control.predictor = list(compute = TRUE, link = link))
###Extract relevant information and store in the list
model_summary <- model$summary.fixed
bri_hyperpar_summary <- bri.hyperpar.summary(model)
model_DIC <- model$dic$dic
model_WAIC <- model$waic$waic
preds_model <- model$summary.fitted.values
preds_model <- cbind(dataset$id, dataset$time, preds_model)
colnames(preds_model) <- c("id", "time", "mean", "sd", "0.025quant", "0.5quant", "0.975quant", "mode")
marginal_fvs <- model$marginals.fitted.values
#Exponentiating parameter to get better interpretation of estimates
multeff <- exp(model$summary.fixed$mean)
names(multeff) <- model$names.fixed
#Plot of each parameters' posterior density
mf <- melt(model$marginals.fixed)
cf <- spread(mf,Var2,value)
names(cf)[2] <- 'parameter'
param_plot = ggplot(cf,aes(x=x,y=y)) + geom_line()+facet_wrap(~ parameter,
scales="free") + geom_vline(xintercept=0) + ylab("density")
#Plot of precision of random effect (main hyperparameter of interest)
sden <- data.frame(bri.hyper.sd(model$marginals.hyperpar[[1]]))
hyperparam_plot = ggplot(sden,aes(x,y)) + geom_line() + ylab("density") +
xlab("linear predictor")
#Store the results in the list
kgr_model3_results = list(
K_EPA_heatmap = K_EPA_heatmap,
K_time_heatmap = K_time_heatmap,
K_EPA_weight = K_EPA_norm / (K_EPA_norm + K_time_norm),
K_time_weight = K_time_norm / (K_EPA_norm + K_time_norm),
K_weight = K_weight/(K_weight + gfilter_weight),
gfilter_weight = gfilter_weight/(K_weight + gfilter_weight),
gram_matrix = gram,
covmatrix = covGP3,
prec = inv_covGP3,
num_jitters = covGP_jittered[[2]],
prec_heatmap = inv_covGP3_heatmap,
model_summary = model_summary,
bri_hyperpar_summary = bri_hyperpar_summary,
exp_effects = multeff,
param_plot = param_plot,
hyperparam_plot = hyperparam_plot,
model_DIC = model_DIC,
model_WAIC = model_WAIC,
fitted_values = preds_model,
marg_fitted_values = marginal_fvs
)
return(kgr_model3_results)
}
#Fit kgr_model3
kgr_model3_fit = kgr_model3(dataset = inla_insample_data, rho_EPA_rbf = 77.429, rho_EPA_periodic = 5164.252,
rho_time_rbf = 6802.120, rho_time_periodic = 8977.554, sigma2 = 4.997)
#Extract DIC and WAIC
kgr_model3_DIC = kgr_model3_fit$model_DIC
kgr_model3_WAIC = kgr_model3_fit$model_WAIC
#Get summaries of parameter estimates
kgr_model3_fit$model_summary
## mean sd 0.025quant 0.5quant 0.975quant mode
## months1 2.0982597 7.254730 -12.129385 2.0982597 16.32590 2.0982597
## months2 1.8695316 7.254731 -12.358117 1.8695316 16.09718 1.8695316
## months3 1.8912210 7.254731 -12.336427 1.8912210 16.11887 1.8912210
## months4 1.6963041 7.254733 -12.531348 1.6963041 15.92396 1.6963041
## months5 1.6160438 7.254734 -12.611610 1.6160438 15.84370 1.6160438
## months6 1.4917139 7.254735 -12.735943 1.4917139 15.71937 1.4917139
## months7 1.4551478 7.254742 -12.772522 1.4551478 15.68282 1.4551478
## months8 1.4338531 7.254742 -12.793817 1.4338531 15.66152 1.4338531
## months9 1.3925640 7.254743 -12.835108 1.3925640 15.62024 1.3925640
## months10 1.4734486 7.254742 -12.754221 1.4734486 15.70112 1.4734486
## months11 1.5165188 7.254741 -12.711149 1.5165188 15.74419 1.5165188
## months12 1.7759331 7.254738 -12.451728 1.7759331 16.00359 1.7759331
## Intercept1 1.8604754 7.254759 -12.367228 1.8604754 16.08818 1.8604754
## Intercept2 2.1517427 7.254759 -12.075960 2.1517427 16.37945 2.1517427
## Intercept3 2.0742013 7.254782 -12.153546 2.0742013 16.30195 2.0742013
## Intercept4 4.2846693 7.254756 -9.943027 4.2846693 18.51237 4.2846693
## Intercept5 3.6470121 7.254770 -10.580712 3.6470121 17.87474 3.6470121
## Intercept6 0.6822521 7.254877 -13.545683 0.6822521 14.91019 0.6822521
## Intercept7 5.0101865 7.254727 -9.217455 5.0101865 19.23783 5.0101865
## kld
## months1 5.527846e-11
## months2 5.527844e-11
## months3 5.527845e-11
## months4 5.527841e-11
## months5 5.527827e-11
## months6 5.527825e-11
## months7 5.527828e-11
## months8 5.527840e-11
## months9 5.527840e-11
## months10 5.527828e-11
## months11 5.527843e-11
## months12 5.527821e-11
## Intercept1 5.527828e-11
## Intercept2 5.527842e-11
## Intercept3 5.527848e-11
## Intercept4 5.527834e-11
## Intercept5 5.527824e-11
## Intercept6 5.527824e-11
## Intercept7 5.527837e-11
kgr_model3_fit$bri_hyperpar_summary
## mean sd q0.025 q0.5 q0.975 mode
## SD for id2 0.6520788 0.05115156 0.5563718 0.6503108 0.7572794 0.6469547
kgr_model3_fit$exp_effects
## months1 months2 months3 months4 months5 months6 months7
## 8.151970 6.485258 6.627456 5.453754 5.033139 4.444707 4.285117
## months8 months9 months10 months11 months12 Intercept1 Intercept2
## 4.194831 4.025157 4.364260 4.556336 5.905789 6.426791 8.599832
## Intercept3 Intercept4 Intercept5 Intercept6 Intercept7
## 7.958188 72.578540 38.359879 1.978328 149.932694
kgr_model3_fit$K_EPA_weight
## [1] 0.1699687
kgr_model3_fit$K_time_weight
## [1] 0.8300313
kgr_model3_fit$K_weight
## [1] 0.7864507
kgr_model3_fit$gfilter_weight
## [1] 0.2135493
kgr_model3_fit$num_jitters
## [1] 1
#Show plots
kgr_model3_fit$K_time_heatmap
kgr_model3_fit$K_EPA_heatmap
kgr_model3_fit$prec_heatmap
kgr_model3_fit$param_plot
kgr_model3_fit$hyperparam_plot
pp_insample_plot(pred_data = kgr_model3_fit$fitted_values)
\(\Lambda_{c,t} | \textbf{F}, \textbf{S} = \exp(\beta_{c1} I \{ c=1 \} + \beta_{c2} I \{ c=2 \} + ... + \beta_{7} I \{ c=7 \} + \beta_1 I \{t \, mod \, 12 = 1 \} + ... + \beta_{11} I \{t \, mod \, 12 = 11 \} + \textbf{F}_{c,t})\)
where the graph signal \(\textbf{F} | \textbf{S}, \rho^{time}_{rbf}, \rho^{time}_{p}, \rho^{EPA}_{rbf}, \rho^{EPA}_{p}, \sigma^2_{time}, \sigma^2_{EPA} \sim \mathcal{GP}(\textbf{0},(\frac{1}{2}(\textbf{K}_{time} + \textbf{K}_{EPA}) \otimes \textbf{H}^2)\) with \(Cov(F_{c_1,t_1},F_{c_2,t_2}) = \frac{1}{2} \left( \left[ \textbf{K}^{time} \right]_{t_1,t_2} + \left[ \textbf{K}^{EPA} \right]_{t_1,t_2} \right) \left[ \textbf{H}^2 \right]_{c_1,c_2}\).
kgr_model4 = function(dataset,rho_EPA_rbf = 1, rho_EPA_periodic = 1,
rho_time_rbf = 1, rho_time_periodic = 1, sigma2_EPA = 1, sigma2_time = 3,link = 1){
###Calculating gram matrix K_EPA
K_EPA = EPA_kernel(time_span = length(unique(dataset$time)),
rho_rbf = rho_EPA_rbf,rho_periodic = rho_EPA_periodic,sigma2 = 1)
#Heatmap of resulting K
K_EPA_heatmap = matrix_heatmap(K_EPA,title = "K_EPA heatmap")
###Calculating gram matrix K_time
K_time = time_kernel(time_span = length(unique(dataset$time)),rho_rbf = rho_time_rbf,
rho_periodic = rho_time_periodic, sigma2 = sigma2_time)
#Heatmap of resulting K
K_time_heatmap = matrix_heatmap(K_time,title = "K_time heatmap")
K_EPA_norm = norm(K_EPA,type = "F")
K_time_norm = norm(K_time,type = "F")
gram = 0.5*(K_time+K_EPA)
K_weight = norm((1/60)*gram,type = "F")
###Load graph regression kernel
# covGP4 = kronecker(gram,(H^2))
covGP4 = kronecker(gram/60,(H^2)/7)
#Need to ensure precision matrix is not computationally singular i.e det > 0
covGP_jittered = desingularize(covGP4,threshold = 1e-2,increment = 0.01)
covGP4 = covGP_jittered[[1]]
inv_covGP4 = solve(covGP4)
#Heatmap of resulting K
inv_covGP4_heatmap = matrix_heatmap(inv_covGP4,title = "")
###Fit INLA model
# kgr_formula4 = response ~ -1 + Intercept1 + Intercept2 + Intercept3 + Intercept4 +
# Intercept5 + Intercept6 + Intercept7 + f(id2,model = "generic0",Cmatrix = inv_covGP4)
kgr_formula4 = response ~ -1 + months + Intercept1 + Intercept2 + Intercept3 + Intercept4 +
Intercept5 + Intercept6 + Intercept7 + f(id2,model = "generic0",Cmatrix = inv_covGP4)
model = inla(formula = kgr_formula4,family = "poisson",data = dataset,
control.compute = list(dic=TRUE,waic=TRUE,
return.marginals.predictor=TRUE),
control.inla = list(strategy = "laplace"),
control.predictor = list(compute = TRUE, link = link))
###Extract relevant information and store in the list
model_summary <- model$summary.fixed
bri_hyperpar_summary <- bri.hyperpar.summary(model)
model_DIC <- model$dic$dic
model_WAIC <- model$waic$waic
preds_model <- model$summary.fitted.values
preds_model <- cbind(dataset$id, dataset$time, preds_model)
colnames(preds_model) <- c("id", "time", "mean", "sd", "0.025quant", "0.5quant", "0.975quant", "mode")
marginal_fvs <- model$marginals.fitted.values
#Exponentiating parameter to get better interpretation of estimates
multeff <- exp(model$summary.fixed$mean)
names(multeff) <- model$names.fixed
#Plot of each parameters' posterior density
mf <- melt(model$marginals.fixed)
cf <- spread(mf,Var2,value)
names(cf)[2] <- 'parameter'
param_plot = ggplot(cf,aes(x=x,y=y)) + geom_line()+facet_wrap(~ parameter,
scales="free") + geom_vline(xintercept=0) + ylab("density")
#Plot of precision of random effect (main hyperparameter of interest)
sden <- data.frame(bri.hyper.sd(model$marginals.hyperpar[[1]]))
hyperparam_plot = ggplot(sden,aes(x,y)) + geom_line() + ylab("density") +
xlab("linear predictor")
#Store the results in the list
kgr_model4_results = list(
K_EPA_heatmap = K_EPA_heatmap,
K_time_heatmap = K_time_heatmap,
K_EPA_weight = K_EPA_norm / (K_EPA_norm + K_time_norm),
K_time_weight = K_time_norm / (K_EPA_norm + K_time_norm),
K_weight = K_weight/(K_weight + gfilter_weight),
gfilter_weight = gfilter_weight/(K_weight + gfilter_weight),
gram_matrix = gram,
covmatrix = covGP4,
prec = inv_covGP4,
num_jitters = covGP_jittered[[2]],
prec_heatmap = inv_covGP4_heatmap,
model_summary = model_summary,
bri_hyperpar_summary = bri_hyperpar_summary,
exp_effects = multeff,
param_plot = param_plot,
hyperparam_plot = hyperparam_plot,
model_DIC = model_DIC,
model_WAIC = model_WAIC,
fitted_values = preds_model,
marg_fitted_values = marginal_fvs
)
return(kgr_model4_results)
}
#Fit kgr_model4
kgr_model4_fit = kgr_model4(dataset = inla_insample_data, rho_EPA_rbf = 259.326, rho_EPA_periodic = 246.306,
rho_time_rbf = 2.699, rho_time_periodic = 86.005, sigma2_EPA = 3.215, sigma2_time = 3.583)
#Extract DIC and WAIC
kgr_model4_DIC = kgr_model4_fit$model_DIC
kgr_model4_WAIC = kgr_model4_fit$model_WAIC
#Get summaries of parameter estimates
kgr_model4_fit$model_summary
## mean sd 0.025quant 0.5quant 0.975quant mode
## months1 2.0962547 7.254729 -12.131389 2.0962547 16.32390 2.0962547
## months2 1.8678684 7.254731 -12.359779 1.8678684 16.09552 1.8678684
## months3 1.8905177 7.254731 -12.337129 1.8905177 16.11816 1.8905177
## months4 1.6974479 7.254732 -12.530203 1.6974479 15.92510 1.6974479
## months5 1.6173799 7.254733 -12.610272 1.6173799 15.84503 1.6173799
## months6 1.4938692 7.254735 -12.733786 1.4938692 15.72152 1.4938692
## months7 1.4578810 7.254741 -12.769787 1.4578810 15.68555 1.4578810
## months8 1.4355940 7.254742 -12.792075 1.4355940 15.66326 1.4355940
## months9 1.3948558 7.254742 -12.832814 1.3948558 15.62253 1.3948558
## months10 1.4742747 7.254741 -12.753393 1.4742747 15.70194 1.4742747
## months11 1.5158389 7.254740 -12.711827 1.5158389 15.74351 1.5158389
## months12 1.7741877 7.254737 -12.453472 1.7741877 16.00185 1.7741877
## Intercept1 1.8609587 7.254753 -12.366733 1.8609587 16.08865 1.8609587
## Intercept2 2.1518844 7.254749 -12.075800 2.1518844 16.37957 2.1518844
## Intercept3 2.0750270 7.254763 -12.152684 2.0750270 16.30274 2.0750270
## Intercept4 4.2865213 7.254738 -9.941140 4.2865213 18.51418 4.2865213
## Intercept5 3.6470707 7.254747 -10.580608 3.6470707 17.87475 3.6470707
## Intercept6 0.6836237 7.254852 -13.544262 0.6836237 14.91151 0.6836237
## Intercept7 5.0108840 7.254721 -9.216745 5.0108840 19.23851 5.0108840
## kld
## months1 5.527849e-11
## months2 5.527845e-11
## months3 5.527846e-11
## months4 5.527843e-11
## months5 5.527841e-11
## months6 5.527839e-11
## months7 5.527829e-11
## months8 5.527842e-11
## months9 5.527841e-11
## months10 5.527830e-11
## months11 5.527844e-11
## months12 5.527849e-11
## Intercept1 5.527824e-11
## Intercept2 5.527831e-11
## Intercept3 5.527836e-11
## Intercept4 5.527821e-11
## Intercept5 5.527836e-11
## Intercept6 5.527849e-11
## Intercept7 5.527846e-11
kgr_model4_fit$bri_hyperpar_summary
## mean sd q0.025 q0.5 q0.975 mode
## SD for id2 0.5982368 0.0461085 0.5120096 0.596627 0.6931101 0.593578
kgr_model4_fit$exp_effects
## months1 months2 months3 months4 months5 months6 months7
## 8.135642 6.474481 6.622797 5.459995 5.039868 4.454297 4.296845
## months8 months9 months10 months11 months12 Intercept1 Intercept2
## 4.202140 4.034393 4.367866 4.553239 5.895490 6.429898 8.601051
## Intercept3 Intercept4 Intercept5 Intercept6 Intercept7
## 7.964762 72.713084 38.362129 1.981043 150.037313
kgr_model4_fit$K_EPA_weight
## [1] 0.5464999
kgr_model4_fit$K_time_weight
## [1] 0.4535001
kgr_model4_fit$K_weight
## [1] 0.740457
kgr_model4_fit$gfilter_weight
## [1] 0.259543
kgr_model4_fit$num_jitters
## [1] 1
#Show plots
kgr_model4_fit$K_time_heatmap
kgr_model4_fit$K_EPA_heatmap
kgr_model4_fit$prec_heatmap
kgr_model4_fit$param_plot
kgr_model4_fit$hyperparam_plot
pp_insample_plot(pred_data = kgr_model4_fit$fitted_values)
\(\Lambda_{c,t} | \textbf{F}, \textbf{S} = \exp(\beta_{c1} I \{ c=1 \} + \beta_{c2} I \{ c=2 \} + ... + \beta_{7} I \{ c=7 \} + \beta_1 I \{t \, mod \, 12 = 1 \} + ... + \beta_{11} I \{t \, mod \, 12 = 11 \} + \textbf{F}_{c,t})\)
where the graph signal \(\textbf{F} | \textbf{S}, \rho^{AR}_{rbf}, \rho^{AR}_{p}, \rho^{DL}_{rbf}, \rho^{DL}_{p}, \rho^{Int}_{rbf}, \rho^{Int}_{p}, \sigma^2_{AR}, \sigma^2_{DL} , \sigma^2_{Int} \sim \mathcal{GP}(\textbf{0},((\frac{1}{3} \textbf{K}_{AR} + \frac{1}{3} \textbf{K}_{DL} + \frac{1}{3} \textbf{K}_{Interaction}) \otimes \textbf{H}^2)\) with \(Cov[F_{n_1,t_1},F_{n_2,t_2}] = (k(t_1,t_2)+k(x_{t_1},x_{t_2}))(H^2)_{n_1,n_2}\).
kgr_model5 = function(dataset, rho_AR_rbf = 1, rho_AR_periodic = 1, rho_DL_rbf = 1, rho_DL_periodic = 1,
rho_int_rbf = 1, rho_int_periodic = 1, sigma2_AR = 1, sigma2_DL = 1, sigma2_int = 1, link=1){
#Calculating gram matrix K_AR
K_AR_cluster = list()
K_AR_periodic_cluster = list()
for (c in 1:num_clus){
#Grab S_random data for cluster c
cluster_data = decomposed_cluster_data[[c]]
S_random_clus = cluster_data$S_random
#Create a list to contain covariance matrix for each pollutant (8)
K_AR_list = list()
K_AR_periodic_list = list()
time_span = nrow(S_random_clus)
#Calculate a AR 1 covariance matrix for each pollutant and store in list
for (i in 1:8){
ts = S_random_clus[,i]
K_covariate = matrix(nrow=time_span,ncol=time_span)
K_covariate_periodic = matrix(nrow=time_span,ncol=time_span)
for(j in 1:time_span){
for (k in 1:time_span){
if (abs(j-k) <= 1){
K_covariate[j,k] = exp(- ((ts[j] - ts[k])^2) #RBF kernel
/ (2*rho_AR_rbf)) * sigma2_AR
K_covariate_periodic[j,k] = exp(- ((ts[j] - ts[k])^2) #Locally periodic kernel
/ (2*rho_AR_rbf)) * exp(- (2*sin((abs(ts[j] - ts[k]))*pi/12)^2)
/ (rho_AR_periodic)) * sigma2_AR
}
else{
K_covariate_periodic[j,k] = 0
K_covariate[j,k] = 0
}
}
}
K_AR_list[[i]] = K_covariate
K_AR_periodic_list[[i]] = K_covariate_periodic
}
names(K_AR_list) = colnames(S_random_clus)
names(K_AR_periodic_list) = colnames(S_random_clus)
#Add each pollutant's covariance matrix to get AR 1 matrix for each cluster
K_AR = matrix(0,nrow=60,ncol=60)
K_AR_periodic = matrix(0,nrow=60,ncol=60)
for(i in 1:length(K_AR_periodic_list)){
K_AR = K_AR + ((1/8)*K_AR_list[[i]])
K_AR_periodic = K_AR_periodic + ((1/8)*K_AR_periodic_list[[i]])
}
K_AR_cluster[[c]] = K_AR
K_AR_periodic_cluster[[c]] = K_AR_periodic
}
K_AR = matrix(0,nrow=60,ncol=60)
K_AR_periodic = matrix(0,nrow=60,ncol=60)
for(i in 1:num_clus){
K_AR = K_AR + ((1/num_clus)*K_AR_cluster[[i]])
K_AR_periodic = K_AR_periodic + ((1/num_clus)*K_AR_periodic_cluster[[i]])
}
K_AR_norm = norm(K_AR_periodic,type = "F")
#Heatmap of resulting K
# K_AR_heatmap = corrplot(K_AR, order = 'original', cl.pos = 'b', tl.pos = 'n',method = "color", col = COL1('YlOrRd',10),
# title = "AR 1 Covariance Structure")
# K_AR_heatmap = corrplot(K_AR_periodic, order = 'original', cl.pos = 'b', tl.pos = 'n',method = "color", col = COL1('YlOrRd',10), title = "Periodic AR 1 Covariance Structure")
# K_AR_heatmap = matrix_heatmap(K_AR,title = "AR 1 Covariance Structure")
K_AR_heatmap = matrix_heatmap(K_AR_periodic,title = "Periodic AR 1 Covariance Structure")
###Calculating gram matrix K_DL
K_DL_cluster = list()
K_DL_periodic_cluster = list()
for (c in 1:num_clus){
#Grab S_DL data for cluster c
cluster_data = decomposed_cluster_data[[c]]
S_DL_clus = cluster_data$S_DL
#Create a list to store covariance matrix for each DL
K_DL_list = list()
K_DL_periodic_list = list()
dl_lags = c(3,6,12)
tracker = 1
for (i in dl_lags){
K_DL = matrix(nrow=time_span,ncol=time_span)
K_DL_periodic = matrix(nrow=time_span,ncol=time_span)
#Calculate DL covariance matrix for specified lag
for(j in 1:nrow(S_DL_clus)){
for (k in 1:nrow(S_DL_clus)){
if ((abs(j-k) == 0) || (abs(j-k) == i)){
K_DL[j,k] = exp(- (sum(S_DL_clus[j,] - S_DL_clus[k,])^2) / (2*rho_DL_rbf)) * sigma2_DL
K_DL_periodic[j,k] = exp(- (sum(S_DL_clus[j,] - S_DL_clus[k,])^2)
/ (2*rho_DL_rbf)) * exp(- (2*sin(sum(abs(S_DL_clus[j,] - S_DL_clus[k,]))*pi/12)^2)
/ (rho_DL_periodic)) * sigma2_DL
}
else{
K_DL_periodic[j,k] = 0
K_DL[j,k] = 0
}
}
}
K_DL_list[[tracker]] = K_DL
K_DL_periodic_list[[tracker]] = K_DL_periodic
tracker = tracker+1
}
#Combine the 3 DL covariance matrices together
K_DL = matrix(0,nrow=time_span,ncol=time_span)
K_DL_periodic = matrix(0,nrow=time_span,ncol=time_span)
for(i in 1:length(K_DL_periodic_list)){
K_DL = K_DL + ((1/3)*K_DL_list[[i]])
K_DL_periodic = K_DL_periodic + ((1/3)*K_DL_periodic_list[[i]])
}
#Store DL(3,6,12) covariance matrix for each cluster
K_DL_cluster[[c]] = K_DL
K_DL_periodic_cluster[[c]] = K_DL
}
K_DL = matrix(0,nrow=time_span,ncol=time_span)
K_DL_periodic = matrix(0,nrow=time_span,ncol=time_span)
for(i in 1:num_clus){
K_DL = K_DL + ((1/num_clus)*K_DL_cluster[[i]])
K_DL_periodic = K_DL_periodic + ((1/num_clus)*K_DL_periodic_cluster[[i]])
}
K_DL_norm = norm(K_DL_periodic,type = "F")
#Heatmap of resulting K
# K_DL_heatmap = corrplot(K_DL, order = 'original', cl.pos = 'b', tl.pos = 'n',method = "color", col = COL1('YlOrRd',10),
# title = "DL (3,6,12) Covariance Structure")
# K_DL_heatmap = corrplot(K_DL_periodic, order = 'original', cl.pos = 'b', tl.pos = 'n',method = "color", col = COL1('YlOrRd',10), title = "Periodic DL (3,6,12) Covariance Structure")
# K_DL_heatmap = matrix_heatmap(K_DL,title = "DL (3,6,12) Covariance Structure")
K_DL_heatmap = matrix_heatmap(K_DL_periodic,title = "Periodic DL (3,6,12) Covariance Structure")
###Calculating gram matrix K_Interaction
K_Interaction_cluster = list()
K_Interaction_periodic_cluster = list()
for (c in 1:num_clus){
#Grab interaction pair data for cluster c
cluster_data = decomposed_cluster_data[[c]]
W2_clus = cluster_data$W2
K_interaction_list = list()
K_interaction_periodic_list = list()
column_names = colnames(W2_clus)
time_span = nrow(W2_clus)
#Create sequence of indices corresponding to comparisons for real time and one lag interaction effects
lag0_idx = seq(2,3601,by=61)
lag1_idx = seq(1,3600,by=61)
#Calculate a kernel for each interaction pair
for (a in 1:length(column_names)){
interaction = W2_clus[,a]
#First calculate these two interaction kernels separately
K_int0 = matrix(nrow = 60,ncol = 60)
K_int1 = matrix(nrow = 60,ncol = 60)
K_int0_periodic = matrix(nrow = 60,ncol = 60)
K_int1_periodic = matrix(nrow = 60,ncol = 60)
for (i in 1:60){
for (j in 1:60){
#RBF kernels
K_int0[i,j] = exp(- ((interaction[lag0_idx[i]] - interaction[lag0_idx[j]])^2)
/ (2*rho_int_rbf)) * sigma2_int
K_int1[i,j] = exp(- ((interaction[lag1_idx[i]] - interaction[lag1_idx[j]])^2)
/ (2*rho_int_rbf)) * sigma2_int
#Locally periodic kernels
K_int0_periodic[i,j] = exp(- ((interaction[lag0_idx[i]] - interaction[lag0_idx[j]])^2)
/ (2*rho_int_rbf)) *
exp(- (2*sin((abs(interaction[lag0_idx[i]] - interaction[lag0_idx[j]]))*pi/12)^2)
/ (rho_int_periodic)) * sigma2_int
K_int1_periodic[i,j] = exp(- ((interaction[lag1_idx[i]] - interaction[lag1_idx[j]])^2)
/ (2*rho_int_rbf)) *
exp(- (2*sin((abs(interaction[lag1_idx[i]] - interaction[lag1_idx[j]]))*pi/12)^2)
/ (rho_int_periodic)) * sigma2_int
}
}
#Combine real time and one lag interaction kernels together
K_interaction = 0.5*K_int0 + 0.5*K_int1
K_interaction_list[[a]] = K_interaction
K_interaction_periodic = 0.5*K_int0_periodic + 0.5*K_int1_periodic
K_interaction_periodic_list[[a]] = K_interaction_periodic
}
#Combine kernels for each interaction pair together
K_interaction = matrix(0,nrow=60,ncol=60)
K_interaction_periodic = matrix(0,nrow=60,ncol=60)
for(i in 1:length(K_interaction_periodic_list)){
K_interaction = K_interaction + ((1/length(K_interaction_list))*K_interaction_list[[i]])
K_interaction_periodic = K_interaction_periodic + ((1/length(K_interaction_periodic_list))*K_interaction_periodic_list[[i]])
}
#Store final interaction kernel (for all pairs) for each cluster
K_Interaction_cluster[[c]] = K_interaction
K_Interaction_periodic_cluster[[c]] = K_interaction_periodic
}
K_interaction = matrix(0,nrow=60,ncol=60)
K_interaction_periodic = matrix(0,nrow=60,ncol=60)
for(i in 1:num_clus){
K_interaction = K_interaction + ((1/length(K_Interaction_cluster))*K_Interaction_cluster[[i]])
K_interaction_periodic = K_interaction_periodic + ((1/length(K_Interaction_periodic_cluster))*K_Interaction_periodic_cluster[[i]])
}
K_Int_norm = norm(K_interaction_periodic,type = "F")
#Heatmap of resulting K
# K_Interaction_heatmap = corrplot(K_interaction, order = 'original', cl.pos = 'b', tl.pos = 'n',method = "color", col = COL1('YlOrRd',10), title = "Interaction Covariance Structure")
# K_Interaction_heatmap = corrplot(K_interaction_periodic, order = 'original', cl.pos = 'b', tl.pos = 'n',method = "color", col = COL1('YlOrRd',10), title = "Periodic Interaction Covariance Structure")
# K_Interaction_heatmap = matrix_heatmap(K_interaction,title = "Interaction Covariance Structure")
K_Interaction_heatmap = matrix_heatmap(K_interaction_periodic,title = "Periodic Interaction Covariance Structure")
gram = (1/3)*(K_AR_periodic+K_DL_periodic+K_interaction_periodic)
K_weight = norm((1/60)*gram,type = "F")
###Load graph regression kernel
# covGP5 = kronecker(gram,H^2)
covGP5 = kronecker(gram/60,(H^2)/7)
#Need to ensure precision matrix is not computationally singular i.e det > 0
covGP_jittered = desingularize(covGP5,threshold = 1e-2,increment = 0.01)
covGP5 = covGP_jittered[[1]]
inv_covGP5 = solve(covGP5)
#Heatmap of resulting K
inv_covGP5_heatmap = matrix_heatmap(inv_covGP5,title = "")
###Fit INLA model
# kgr_formula5 = response ~ -1 + Intercept1 + Intercept2 + Intercept3 + Intercept4 +
# Intercept5 + Intercept6 + Intercept7 + f(id2,model = "generic0",Cmatrix = inv_covGP5)
kgr_formula5 = response ~ -1 + months + Intercept1 + Intercept2 + Intercept3 + Intercept4 +
Intercept5 + Intercept6 + Intercept7 + f(id2,model = "generic0",Cmatrix = inv_covGP5)
model = inla(formula = kgr_formula5,family = "poisson",data = dataset,
control.compute = list(dic=TRUE,waic=TRUE,
return.marginals.predictor=TRUE),
control.inla = list(strategy = "laplace"),
control.predictor = list(compute = TRUE, link = link))
###Extract relevant information and store in the list
model_summary <- model$summary.fixed
bri_hyperpar_summary <- bri.hyperpar.summary(model)
model_DIC <- model$dic$dic
model_WAIC <- model$waic$waic
preds_model <- model$summary.fitted.values
preds_model <- cbind(dataset$id, dataset$time, preds_model)
colnames(preds_model) <- c("id", "time", "mean", "sd", "0.025quant", "0.5quant", "0.975quant", "mode")
marginal_fvs <- model$marginals.fitted.values
#Exponentiating parameter to get better interpretation of estimates
multeff <- exp(model$summary.fixed$mean)
names(multeff) <- model$names.fixed
#Plot of each parameters' posterior density
mf <- melt(model$marginals.fixed)
cf <- spread(mf,Var2,value)
names(cf)[2] <- 'parameter'
param_plot = ggplot(cf,aes(x=x,y=y)) + geom_line()+facet_wrap(~ parameter,
scales="free") + geom_vline(xintercept=0) + ylab("density")
#Plot of precision of random effect (main hyperparameter of interest)
sden <- data.frame(bri.hyper.sd(model$marginals.hyperpar[[1]]))
hyperparam_plot = ggplot(sden,aes(x,y)) + geom_line() + ylab("density") +
xlab("linear predictor")
#Store the results in the list
kgr_model5_results = list(
K_AR_heatmap = K_AR_heatmap,
K_DL_heatmap = K_DL_heatmap,
K_Interaction_heatmap = K_Interaction_heatmap,
K_AR_weight = K_AR_norm / (K_AR_norm + K_DL_norm + K_Int_norm),
K_DL_weight = K_DL_norm / (K_AR_norm + K_DL_norm + K_Int_norm),
K_Int_weight = K_Int_norm / (K_AR_norm + K_DL_norm + K_Int_norm),
K_weight = K_weight/(K_weight + gfilter_weight),
gfilter_weight = gfilter_weight/(K_weight + gfilter_weight),
gram_matrix = gram,
covmatrix = covGP5,
prec = inv_covGP5,
num_jitters = covGP_jittered[[2]],
prec_heatmap = inv_covGP5_heatmap,
model_summary = model_summary,
bri_hyperpar_summary = bri_hyperpar_summary,
exp_effects = multeff,
param_plot = param_plot,
hyperparam_plot = hyperparam_plot,
model_DIC = model_DIC,
model_WAIC = model_WAIC,
fitted_values = preds_model,
marg_fitted_values = marginal_fvs
)
return(kgr_model5_results)
}
#Fit kgr_model5
kgr_model5_fit = kgr_model5(dataset = inla_insample_data, rho_AR_rbf = 0.016, rho_AR_periodic = 0.019,
rho_DL_rbf = 0.013, rho_DL_periodic = 0.003, rho_int_rbf = 0.002,
rho_int_periodic = 0.011, sigma2_AR = 4.002, sigma2_DL = 0.486, sigma2_int = 3.021, link=1)
#Extract DIC and WAIC
kgr_model5_DIC = kgr_model5_fit$model_DIC
kgr_model5_WAIC = kgr_model5_fit$model_WAIC
#Get summaries of parameter estimates
kgr_model5_fit$model_summary
## mean sd 0.025quant 0.5quant 0.975quant mode
## months1 2.0953297 7.254730 -12.132316 2.0953297 16.32298 2.0953297
## months2 1.8677250 7.254731 -12.359924 1.8677250 16.09537 1.8677250
## months3 1.8913750 7.254731 -12.336273 1.8913750 16.11902 1.8913750
## months4 1.6974139 7.254733 -12.530238 1.6974139 15.92507 1.6974139
## months5 1.6169277 7.254734 -12.610726 1.6169277 15.84458 1.6169277
## months6 1.4929671 7.254735 -12.734689 1.4929671 15.72062 1.4929671
## months7 1.4582942 7.254742 -12.769376 1.4582942 15.68596 1.4582942
## months8 1.4358553 7.254742 -12.791815 1.4358553 15.66353 1.4358553
## months9 1.3937543 7.254743 -12.833917 1.3937543 15.62143 1.3937543
## months10 1.4741209 7.254741 -12.753547 1.4741209 15.70179 1.4741209
## months11 1.5174900 7.254741 -12.710177 1.5174900 15.74516 1.5174900
## months12 1.7748719 7.254738 -12.452789 1.7748719 16.00253 1.7748719
## Intercept1 1.8605094 7.254750 -12.367176 1.8605094 16.08819 1.8605094
## Intercept2 2.1515808 7.254744 -12.076093 2.1515808 16.37925 2.1515808
## Intercept3 2.0712805 7.254752 -12.156408 2.0712805 16.29897 2.0712805
## Intercept4 4.2889445 7.254727 -9.938695 4.2889445 18.51658 4.2889445
## Intercept5 3.6581731 7.254733 -10.569478 3.6581731 17.88582 3.6581731
## Intercept6 0.6781006 7.254837 -13.549755 0.6781006 14.90596 0.6781006
## Intercept7 5.0075362 7.254718 -9.220087 5.0075362 19.23516 5.0075362
## kld
## months1 5.527847e-11
## months2 5.527831e-11
## months3 5.527831e-11
## months4 5.527842e-11
## months5 5.527827e-11
## months6 5.527839e-11
## months7 5.527841e-11
## months8 5.527827e-11
## months9 5.527840e-11
## months10 5.527843e-11
## months11 5.527843e-11
## months12 5.527848e-11
## Intercept1 5.527829e-11
## Intercept2 5.527838e-11
## Intercept3 5.527841e-11
## Intercept4 5.527851e-11
## Intercept5 5.527854e-11
## Intercept6 5.527859e-11
## Intercept7 5.527840e-11
kgr_model5_fit$bri_hyperpar_summary
## mean sd q0.025 q0.5 q0.975 mode
## SD for id2 0.5871699 0.04499297 0.5029716 0.5856181 0.6796974 0.5826719
kgr_model5_fit$exp_effects
## months1 months2 months3 months4 months5 months6 months7
## 8.128120 6.473553 6.628477 5.459809 5.037590 4.450280 4.298620
## months8 months9 months10 months11 months12 Intercept1 Intercept2
## 4.203238 4.029951 4.367195 4.560764 5.899525 6.427010 8.598440
## Intercept3 Intercept4 Intercept5 Intercept6 Intercept7
## 7.934977 72.889492 38.790413 1.970132 149.535860
kgr_model5_fit$K_AR_weight
## [1] 0.3121421
kgr_model5_fit$K_DL_weight
## [1] 0.03542519
kgr_model5_fit$K_Int_weight
## [1] 0.6524327
kgr_model5_fit$K_weight
## [1] 0.6824774
kgr_model5_fit$gfilter_weight
## [1] 0.3175226
kgr_model5_fit$num_jitters
## [1] 1
#Show plots
kgr_model5_fit$K_AR_heatmap
kgr_model5_fit$K_DL_heatmap
kgr_model5_fit$K_Interaction_heatmap
kgr_model5_fit$prec_heatmap
kgr_model5_fit$param_plot
kgr_model5_fit$hyperparam_plot
pp_insample_plot(pred_data = kgr_model5_fit$fitted_values)
With INLA, we can obtain the deviance information criterion (DIC) and the widely appliciable (or Watanabe-Akaike) information criterion (WAIC) which have the following formulas:
\(DIC = \bar D + p_D\) where the first term is the posterior mean deviance i.e., a measure of fit \(\bar D = E_{\theta | y} [D(\theta)]\) and the second term is the effective number of parameters i.e. a measure of model complexity \(p_D = E_{\theta | y} [D(\theta)] - D(E_{\theta | y}[\theta]) = \bar D - D(\bar \theta)\)
where \(D(\theta) = -2 log(p(y | \theta))\)
\(WAIC = T_n + \frac{V_n}{n}\) where \(T_n = -\frac{1}{n} \sum_{i=1}^n log p^*(Y_i | w)\) and \(V_n = \sum_{i=1}^n \{ E_w[(log p(Y_i | w))^2] - E_w[log p(Y_i | w)]^2 \}\)
where \(T_n\) is the log loss function and \(w\) is are the parameters in our model.
Also note that for both criteria, the smaller the value, the better the model
infocrit_table = matrix(nrow = 8,ncol = 2)
dics = c(ref_model1_DIC,ref_model2_DIC,kgr_model1_DIC,
kgr_model2_DIC,kgr_model3_DIC,kgr_model4_DIC,kgr_model5_DIC)
waics = c(ref_model1_WAIC,ref_model2_WAIC,kgr_model1_WAIC,
kgr_model2_WAIC,kgr_model3_WAIC,kgr_model4_WAIC,kgr_model5_WAIC)
infocrit_table = cbind(dics,waics)
colnames(infocrit_table) = c("DIC","WAIC")
rownames(infocrit_table) = c("Poisson GLM model","BYM model",
"Proposed KGR model 1","Proposed KGR model 2",
"Proposed KGR model 3","Proposed KGR model 4",
"Proposed KGR model 5")
infocrit_table = data.frame(infocrit_table)
infocrit_table
## DIC WAIC
## Poisson GLM model 2869.994 4065.576
## BYM model 2869.994 4065.576
## Proposed KGR model 1 2840.376 2818.437
## Proposed KGR model 2 2843.744 2823.314
## Proposed KGR model 3 2844.194 2823.918
## Proposed KGR model 4 2834.313 2809.218
## Proposed KGR model 5 2839.578 2815.813
One way to compare performance between the models fit above is to calculate RMSEs for each model’s fit on each cluster’s time series. Since INLA makes predictions based on the posterior predictive distribution, I actually calculated two sets of RMSEs. The first one is the RMSE of the predictions made by each model on the observed training data points i.e. not months 55-60. These were the observations that the models were fit on so we would expect small discrepancies between the observed values and the posterior predictive means for those time periods. The second one is the RMSE of the predictions made by each model on the test data points i.e. months 55-60. There was a lot more variation in the RMSEs calculated for these data points obviously.
Another important thing to note here is that the RMSEs calculated for each cluster were drastically different because the population sizes between clusters varied by a lot (think thousands compared to hundred thousands). So in order to make actual comparisons, the RMSEs had to be scaled which involves dividing the calculated RMSE by the average of the actual observed data points. The resulting RMSE values for each cluster, which are presented in tables below, can now be interpreted relative to the average number of respiratory related deaths in that cluster.
#Overall fit
RMSE_table = matrix(nrow=7,ncol=num_clus)
for (i in 1:num_clus){
actual = inla_insample_data %>% filter(id == i) %>% select(response) %>% data.frame()
actual.mean = mean(actual$response)
pm_1 = ref_model2_fit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
pm_2 = ref_model3_fvs %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
pm_3 = kgr_model1_fit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
pm_4 = kgr_model2_fit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
pm_5 = kgr_model3_fit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
pm_6 = kgr_model4_fit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
pm_7 = kgr_model5_fit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
rmse1 = sqrt(mean((actual.mean - pm_1$mean)^2))
rmse2 = sqrt(mean((actual.mean - pm_2$mean)^2))
rmse3 = sqrt(mean((actual.mean - pm_3$mean)^2))
rmse4 = sqrt(mean((actual.mean - pm_4$mean)^2))
rmse5 = sqrt(mean((actual.mean - pm_5$mean)^2))
rmse6 = sqrt(mean((actual.mean - pm_6$mean)^2))
rmse7 = sqrt(mean((actual.mean - pm_7$mean)^2))
RMSE_table[,i] = c(rmse1,rmse2,rmse3,rmse4,rmse5,rmse6,rmse7)
RMSE_table[,i] = RMSE_table[,i] / actual.mean
}
#Table 1: In sample RMSE
RMSE_table = data.frame(RMSE_table)
colnames(RMSE_table) = c("Cluster 1","Cluster 2","Cluster 3","Cluster 4",
"Cluster 5","Cluster 6","Cluster 7")
rownames(RMSE_table) = c("BYM model","LGCP model","Proposed KGR model 1",
"Proposed KGR model 2","Proposed KGR model 3",
"Proposed KGR model 4","Proposed KGR model 5")
RMSE_table
## Cluster 1 Cluster 2 Cluster 3 Cluster 4 Cluster 5
## BYM model 0.2294248 0.2294094 0.2294130 0.2293690 0.2293738
## LGCP model 0.2290867 0.1863443 0.1848827 0.2473252 0.2674834
## Proposed KGR model 1 0.2362066 0.2309661 0.2296718 0.2480188 0.2568016
## Proposed KGR model 2 0.2362633 0.2304875 0.2294321 0.2477997 0.2565537
## Proposed KGR model 3 0.2363246 0.2305784 0.2296168 0.2477983 0.2561993
## Proposed KGR model 4 0.2351323 0.2309101 0.2281533 0.2483927 0.2595721
## Proposed KGR model 5 0.2343045 0.2295263 0.2270735 0.2482920 0.2599069
## Cluster 6 Cluster 7
## BYM model 0.229563102 0.2293662
## LGCP model 0.001213593 0.2316083
## Proposed KGR model 1 0.222658691 0.2350360
## Proposed KGR model 2 0.222920607 0.2350932
## Proposed KGR model 3 0.223358648 0.2351275
## Proposed KGR model 4 0.218010552 0.2346097
## Proposed KGR model 5 0.218483392 0.2344728
For out of sample model fitting, we now use inla_outsample_data which has t=60 now instead of t=54. Recall that the response values for t=55,…,60 are NA in order for INLA to make posterior predictive predictions.
true_mortality = inla_full_data
true_mortality$time = as.numeric(true_mortality$time)
#Combine plots with library patchwork
true1 = true_mortality %>% filter(id == 1) %>% ggplot(aes(x=time,y=response)) + geom_line()
true2 = true_mortality %>% filter(id == 2) %>% ggplot(aes(x=time,y=response)) + geom_line()
true3 = true_mortality %>% filter(id == 3) %>% ggplot(aes(x=time,y=response)) + geom_line()
true4 = true_mortality %>% filter(id == 4) %>% ggplot(aes(x=time,y=response)) + geom_line()
true5 = true_mortality %>% filter(id == 5) %>% ggplot(aes(x=time,y=response)) + geom_line()
true6 = true_mortality %>% filter(id == 6) %>% ggplot(aes(x=time,y=response)) + geom_line()
true7 = true_mortality %>% filter(id == 7) %>% ggplot(aes(x=time,y=response)) + geom_line()
true1 + true2 + true3 + true4 + true5 + true6 + true7
#Write a function to make plot of posterior predictive estimates with credible interval bands OVERLAID on response
pp_outsample_plot = function(num_plots = num_clus, ref_data = inla_full_data, pred_data){
for (i in 1:num_plots){
df = ref_data %>% filter(id == i) %>% select(response)
preds = pred_data %>% filter(id == i)
df = cbind(df,preds)
# title = sprintf("Posterior Predictive Fits for Cluster %s",i)
title = sprintf("Cluster %s",i)
post_pred_plot = df %>% ggplot(aes(x=time,y=response)) + geom_point() +
geom_line(aes(y=mean),color = "red") + geom_ribbon(aes(ymin = `0.025quant`,ymax = `0.975quant`),alpha = 0.3) + geom_vline(xintercept = 54,linetype = "dashed",color = "blue",linewidth = 1.5) + ggtitle(title)
print(post_pred_plot)
}
}
#Run model
ref_model1_outfit = ref_model1(inla_outsample_data, a_prior=1, b_prior=5e-5)
#Extract DIC and WAIC
ref_model1_DIC = ref_model1_outfit$model_DIC
ref_model1_WAIC = ref_model1_outfit$model_WAIC
#Get summaries of parameter estimates
ref_model1_outfit$model_summary
## mean sd 0.025quant 0.5quant 0.975quant mode
## months1 2.1039998 7.254715 -12.123617 2.1039998 16.33162 2.1039998
## months2 1.8618772 7.254716 -12.365742 1.8618772 16.08950 1.8618772
## months3 1.8812181 7.254716 -12.346401 1.8812181 16.10884 1.8812181
## months4 1.6930264 7.254718 -12.534595 1.6930264 15.92065 1.6930264
## months5 1.6124055 7.254718 -12.615218 1.6124055 15.84003 1.6124055
## months6 1.4969693 7.254719 -12.730656 1.4969693 15.72459 1.4969693
## months7 1.4681954 7.254722 -12.759435 1.4681954 15.69583 1.4681954
## months8 1.4344602 7.254723 -12.793171 1.4344602 15.66209 1.4344602
## months9 1.4008382 7.254723 -12.826794 1.4008382 15.62847 1.4008382
## months10 1.4724084 7.254722 -12.755222 1.4724084 15.70004 1.4724084
## months11 1.5162769 7.254722 -12.711352 1.5162769 15.74391 1.5162769
## months12 1.7814817 7.254719 -12.446142 1.7814817 16.00911 1.7814817
## Intercept1 1.8619680 7.254756 -12.365728 1.8619680 16.08966 1.8619680
## Intercept2 2.1524555 7.254747 -12.075225 2.1524555 16.38014 2.1524555
## Intercept3 2.0751101 7.254749 -12.152574 2.0751101 16.30279 2.0751101
## Intercept4 4.2895287 7.254726 -9.938109 4.2895287 18.51717 4.2895287
## Intercept5 3.6523902 7.254728 -10.575252 3.6523902 17.88003 3.6523902
## Intercept6 0.6814357 7.254830 -13.546407 0.6814357 14.90928 0.6814357
## Intercept7 5.0102688 7.254724 -9.217366 5.0102688 19.23790 5.0102688
## kld
## months1 5.527842e-11
## months2 5.527839e-11
## months3 5.527853e-11
## months4 5.527838e-11
## months5 5.527837e-11
## months6 5.527835e-11
## months7 5.527832e-11
## months8 5.527844e-11
## months9 5.527844e-11
## months10 5.527831e-11
## months11 5.527845e-11
## months12 5.527823e-11
## Intercept1 5.527821e-11
## Intercept2 5.527834e-11
## Intercept3 5.527843e-11
## Intercept4 5.527839e-11
## Intercept5 5.527849e-11
## Intercept6 5.527842e-11
## Intercept7 5.527842e-11
ref_model1_outfit$bri_hyperpar_summary
## mean sd q0.025 q0.5 q0.975 mode
## SD for id 0.01157621 0.009997564 0.003655985 0.008445019 0.04078185 0.005739866
ref_model1_outfit$exp_effects
## months1 months2 months3 months4 months5 months6 months7
## 8.198898 6.435807 6.561492 5.435907 5.014860 4.468127 4.341394
## months8 months9 months10 months11 months12 Intercept1 Intercept2
## 4.197379 4.058601 4.359723 4.555234 5.938649 6.436391 8.605964
## Intercept3 Intercept4 Intercept5 Intercept6 Intercept7
## 7.965424 72.932088 38.566739 1.976714 149.945037
#Show plots
ref_model1_outfit$param_plot
ref_model1_outfit$hyperparam_plot
pp_outsample_plot(pred_data = ref_model1_outfit$fitted_values)
#Run model
ref_model2_outfit = ref_model2(dataset = inla_outsample_data,a_prec_prior = 1,b_prec_prior = 5e-4,
a_phi_prior = 1,b_phi_prior = 5e-4)
#Extract DIC and WAIC
ref_model2_DIC = ref_model2_outfit$model_DIC
ref_model2_WAIC = ref_model2_outfit$model_WAIC
#Get summaries of parameter estimates
ref_model2_outfit$model_summary %>% kable %>% kable_styling()
| mean | sd | 0.025quant | 0.5quant | 0.975quant | mode | kld | |
|---|---|---|---|---|---|---|---|
| months1 | 2.1039921 | 7.254718 | -12.123630 | 2.1039921 | 16.33161 | 2.1039921 | 0 |
| months2 | 1.8618695 | 7.254719 | -12.365755 | 1.8618695 | 16.08949 | 1.8618695 | 0 |
| months3 | 1.8812104 | 7.254719 | -12.346414 | 1.8812104 | 16.10883 | 1.8812104 | 0 |
| months4 | 1.6930187 | 7.254720 | -12.534608 | 1.6930187 | 15.92065 | 1.6930187 | 0 |
| months5 | 1.6123978 | 7.254721 | -12.615230 | 1.6123978 | 15.84003 | 1.6123978 | 0 |
| months6 | 1.4969616 | 7.254722 | -12.730669 | 1.4969616 | 15.72459 | 1.4969616 | 0 |
| months7 | 1.4681877 | 7.254725 | -12.759448 | 1.4681877 | 15.69582 | 1.4681877 | 0 |
| months8 | 1.4344525 | 7.254725 | -12.793184 | 1.4344525 | 15.66209 | 1.4344525 | 0 |
| months9 | 1.4008305 | 7.254726 | -12.826807 | 1.4008305 | 15.62847 | 1.4008305 | 0 |
| months10 | 1.4724007 | 7.254725 | -12.755235 | 1.4724007 | 15.70004 | 1.4724007 | 0 |
| months11 | 1.5162691 | 7.254724 | -12.711365 | 1.5162691 | 15.74390 | 1.5162691 | 0 |
| months12 | 1.7814741 | 7.254721 | -12.446155 | 1.7814741 | 16.00910 | 1.7814741 | 0 |
| Intercept1 | 1.8619539 | 7.254901 | -12.366028 | 1.8619539 | 16.08994 | 1.8619539 | 0 |
| Intercept2 | 2.1524458 | 7.254893 | -12.075519 | 2.1524458 | 16.38041 | 2.1524458 | 0 |
| Intercept3 | 2.0750989 | 7.254900 | -12.152881 | 2.0750989 | 16.30308 | 2.0750989 | 0 |
| Intercept4 | 4.2895317 | 7.254876 | -9.938401 | 4.2895317 | 18.51746 | 4.2895317 | 0 |
| Intercept5 | 3.6523956 | 7.254888 | -10.575560 | 3.6523956 | 17.88035 | 3.6523956 | 0 |
| Intercept6 | 0.6813712 | 7.254981 | -13.546768 | 0.6813712 | 14.90951 | 0.6813712 | 0 |
| Intercept7 | 5.0102675 | 7.254869 | -9.217652 | 5.0102675 | 19.23819 | 5.0102675 | 0 |
ref_model2_outfit$bri_hyperpar_summary %>% kable %>% kable_styling()
| mean | sd | q0.025 | q0.5 | q0.975 | mode | |
|---|---|---|---|---|---|---|
| SD for id (idd component) | 0.0304937 | 0.0168614 | 0.0115685 | 0.0259311 | 0.0754195 | 0.0196278 |
| SD for id (spatial component) | 0.0319543 | 0.0186959 | 0.0118197 | 0.0266979 | 0.0821752 | 0.0197357 |
ref_model2_outfit$exp_effects
## months1 months2 months3 months4 months5 months6 months7
## 8.198835 6.435757 6.561442 5.435865 5.014821 4.468093 4.341360
## months8 months9 months10 months11 months12 Intercept1 Intercept2
## 4.197346 4.058569 4.359689 4.555199 5.938604 6.436300 8.605881
## Intercept3 Intercept4 Intercept5 Intercept6 Intercept7
## 7.965334 72.932308 38.566948 1.976586 149.944845
#Show plots
ref_model2_outfit$param_plot
ref_model2_outfit$hyperparam_plot
pp_outsample_plot(pred_data = ref_model2_outfit$fitted_values)
#Run model
kgr_model2_outfit = kgr_model2(data = inla_outsample_data,rho_EPA_rbf = 23.009,rho_EPA_periodic = 5253.017, sigma2_EPA = 0.928)
#Extract DIC and WAIC
kgr_model2_DIC = kgr_model2_outfit$model_DIC
kgr_model2_WAIC = kgr_model2_outfit$model_WAIC
#Get summaries of parameter estimates
kgr_model2_outfit$model_summary
## mean sd 0.025quant 0.5quant 0.975quant mode
## months1 2.0988842 7.254730 -12.128761 2.0988842 16.32653 2.0988842
## months2 1.8695171 7.254731 -12.358131 1.8695171 16.09717 1.8695171
## months3 1.8910867 7.254731 -12.336562 1.8910867 16.11874 1.8910867
## months4 1.6964759 7.254733 -12.531176 1.6964759 15.92413 1.6964759
## months5 1.6161023 7.254734 -12.611552 1.6161023 15.84376 1.6161023
## months6 1.4917535 7.254735 -12.735903 1.4917535 15.71941 1.4917535
## months7 1.4552376 7.254742 -12.772432 1.4552376 15.68291 1.4552376
## months8 1.4342242 7.254742 -12.793446 1.4342242 15.66189 1.4342242
## months9 1.3929092 7.254743 -12.834762 1.3929092 15.62058 1.3929092
## months10 1.4734083 7.254742 -12.754261 1.4734083 15.70108 1.4734083
## months11 1.5164308 7.254741 -12.711237 1.5164308 15.74410 1.5164308
## months12 1.7763125 7.254738 -12.451349 1.7763125 16.00397 1.7763126
## Intercept1 1.8604050 7.254759 -12.367297 1.8604050 16.08811 1.8604050
## Intercept2 2.1517664 7.254758 -12.075934 2.1517664 16.37947 2.1517664
## Intercept3 2.0744605 7.254779 -12.153282 2.0744605 16.30220 2.0744605
## Intercept4 4.2859238 7.254753 -9.941767 4.2859238 18.51361 4.2859238
## Intercept5 3.6475167 7.254767 -10.580201 3.6475167 17.87523 3.6475167
## Intercept6 0.6821251 7.254874 -13.545803 0.6821251 14.91005 0.6821251
## Intercept7 5.0101447 7.254727 -9.217495 5.0101447 19.23778 5.0101447
## kld
## months1 5.527848e-11
## months2 5.527831e-11
## months3 5.527844e-11
## months4 5.527827e-11
## months5 5.527840e-11
## months6 5.527837e-11
## months7 5.527841e-11
## months8 5.527854e-11
## months9 5.527840e-11
## months10 5.527828e-11
## months11 5.527843e-11
## months12 5.527848e-11
## Intercept1 5.527843e-11
## Intercept2 5.527831e-11
## Intercept3 5.527825e-11
## Intercept4 5.527852e-11
## Intercept5 5.527843e-11
## Intercept6 5.527843e-11
## Intercept7 5.527851e-11
kgr_model2_outfit$bri_hyperpar_summary
## mean sd q0.025 q0.5 q0.975 mode
## SD for id2 0.651928 0.05113911 0.5562518 0.6501578 0.7571101 0.6467985
kgr_model2_outfit$exp_effects
## months1 months2 months3 months4 months5 months6 months7
## 8.157063 6.485164 6.626566 5.454691 5.033433 4.444883 4.285502
## months8 months9 months10 months11 months12 Intercept1 Intercept2
## 4.196388 4.026547 4.364084 4.555935 5.908031 6.426339 8.600036
## Intercept3 Intercept4 Intercept5 Intercept6 Intercept7
## 7.960251 72.669649 38.379242 1.978077 149.926426
kgr_model2_outfit$K_EPA_weight
## [1] 0.7928439
kgr_model2_outfit$gfilter_weight
## [1] 0.2071561
#Show plots
kgr_model2_outfit$K_EPA_heatmap
kgr_model2_outfit$param_plot
kgr_model2_outfit$hyperparam_plot
pp_outsample_plot(pred_data = kgr_model2_outfit$fitted_values)
#Fit ref_model3 on one cluster (to test)
ref_model3_outfit = ref_model3(dataset = inla_outsample_data, cluster = 1, rho_time_rbf = 1,
rho_time_periodic = 1, sigma2_time = 5)
#Extract DIC and WAIC
ref_model3_DIC = ref_model3_outfit$model_DIC
ref_model3_WAIC = ref_model3_outfit$model_WAIC
#Get summaries of parameter estimates
ref_model3_outfit$model_summary
## mean sd 0.025quant 0.5quant 0.975quant
## Intercept1 3.523249e+00 0.05260819 3.418442 3.523621e+00 3.62598
## Intercept2 5.572308e-15 31.62254149 -62.016688 -4.615441e-15 62.01669
## Intercept3 5.572308e-15 31.62254149 -62.016688 -4.615441e-15 62.01669
## Intercept4 5.572308e-15 31.62254149 -62.016688 -4.615441e-15 62.01669
## Intercept5 5.572308e-15 31.62254149 -62.016688 -4.615441e-15 62.01669
## Intercept6 5.572308e-15 31.62254149 -62.016688 -4.615441e-15 62.01669
## Intercept7 5.572308e-15 31.62254149 -62.016688 -4.615441e-15 62.01669
## mode kld
## Intercept1 3.524363 1.205770e-08
## Intercept2 0.000000 5.527836e-11
## Intercept3 0.000000 5.527836e-11
## Intercept4 0.000000 5.527836e-11
## Intercept5 0.000000 5.527836e-11
## Intercept6 0.000000 5.527836e-11
## Intercept7 0.000000 5.527836e-11
ref_model3_outfit$bri_hyperpar_summary
## mean sd q0.025 q0.5 q0.975 mode
## SD for time 0.1027931 0.0157207 0.07501207 0.1016756 0.1367843 0.09961566
ref_model3_outfit$exp_effects
## Intercept1 Intercept2 Intercept3 Intercept4 Intercept5 Intercept6 Intercept7
## 33.89438 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000
ref_model3_outfit$K_time_weight
## [1] 0.9952043
ref_model3_outfit$gfilter_weight
## [1] 0.004795715
#Show plots
ref_model3_outfit$K_time_heatmap
## Warning in geom_rect(aes(xmin = x_min, xmax = x_max, ymin = y_min, ymax = y_max), : All aesthetics have length 1, but the data has 3600 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
## a single row.
ref_model3_outfit$param_plot
ref_model3_outfit$hyperparam_plot
test1 = ref_model3(dataset = inla_outsample_data, cluster = 1, rho_time_rbf = 105.474,
rho_time_periodic = 1.902, sigma2_time = 1.802)
test2 = ref_model3(dataset = inla_outsample_data, cluster = 2, rho_time_rbf = 498.918,
rho_time_periodic = 120.307, sigma2_time = 0.014)
test3 = ref_model3(dataset = inla_outsample_data, cluster = 3, rho_time_rbf = 8.090,
rho_time_periodic = 468.170, sigma2_time = 2.428)
test4 = ref_model3(dataset = inla_outsample_data, cluster = 4, rho_time_rbf = 105.474,
rho_time_periodic = 1.902, sigma2_time = 1.802) #why does sigma2 have to be so big here??
test5 = ref_model3(dataset = inla_outsample_data, cluster = 5, rho_time_rbf = 105.474,
rho_time_periodic = 1.902, sigma2_time = 1.802)
test6 = ref_model3(dataset = inla_outsample_data, cluster = 6, rho_time_rbf = 105.474,
rho_time_periodic = 1.902, sigma2_time = 1.802) #why does sigma2 have to be so big here??
test7 = ref_model3(dataset = inla_outsample_data, cluster = 7, rho_time_rbf = 474.985,
rho_time_periodic = 2.050, sigma2_time = 1.489)
ref_model3_outfvs = rbind(test1$fitted_values,test2$fitted_values,test3$fitted_values,
test4$fitted_values,test5$fitted_values,test6$fitted_values,test7$fitted_values)
pp_outsample_plot(num_plots = num_clus,ref_data = inla_full_data,pred_data = ref_model3_outfvs)
#Fit kgr_model1
kgr_model1_outfit = kgr_model1(dataset = inla_outsample_data,rho_time_rbf = 63.888,rho_time_periodic = 5.845,sigma2_time = 0.720)
#Extract DIC and WAIC
kgr_model1_DIC = kgr_model1_outfit$model_DIC
kgr_model1_WAIC = kgr_model1_outfit$model_WAIC
#Get summaries of parameter estimates
kgr_model1_outfit$model_summary
## mean sd 0.025quant 0.5quant 0.975quant mode
## months1 2.0980140 7.254728 -12.129629 2.0980140 16.32566 2.0980140
## months2 1.8695839 7.254730 -12.358062 1.8695839 16.09723 1.8695839
## months3 1.8914904 7.254730 -12.336156 1.8914904 16.11914 1.8914904
## months4 1.6970298 7.254732 -12.530620 1.6970298 15.92468 1.6970298
## months5 1.6170625 7.254733 -12.610589 1.6170625 15.84471 1.6170625
## months6 1.4933315 7.254734 -12.734323 1.4933315 15.72099 1.4933315
## months7 1.4550676 7.254741 -12.772600 1.4550676 15.68273 1.4550676
## months8 1.4333937 7.254741 -12.794274 1.4333937 15.66106 1.4333937
## months9 1.3922575 7.254742 -12.835412 1.3922575 15.61993 1.3922575
## months10 1.4728782 7.254740 -12.754788 1.4728782 15.70054 1.4728782
## months11 1.5157027 7.254740 -12.711963 1.5157027 15.74337 1.5157027
## months12 1.7749704 7.254736 -12.452688 1.7749704 16.00263 1.7749704
## Intercept1 1.8604852 7.254750 -12.367199 1.8604852 16.08817 1.8604852
## Intercept2 2.1517184 7.254743 -12.075953 2.1517184 16.37939 2.1517184
## Intercept3 2.0736400 7.254749 -12.154043 2.0736400 16.30132 2.0736400
## Intercept4 4.2849909 7.254724 -9.942644 4.2849909 18.51263 4.2849909
## Intercept5 3.6474251 7.254729 -10.580220 3.6474251 17.87507 3.6474251
## Intercept6 0.6824676 7.254833 -13.545380 0.6824676 14.91031 0.6824676
## Intercept7 5.0100549 7.254718 -9.217567 5.0100549 19.23768 5.0100549
## kld
## months1 5.527848e-11
## months2 5.527833e-11
## months3 5.527832e-11
## months4 5.527843e-11
## months5 5.527828e-11
## months6 5.527840e-11
## months7 5.527830e-11
## months8 5.527830e-11
## months9 5.527842e-11
## months10 5.527844e-11
## months11 5.527831e-11
## months12 5.527837e-11
## Intercept1 5.527829e-11
## Intercept2 5.527841e-11
## Intercept3 5.527830e-11
## Intercept4 5.527841e-11
## Intercept5 5.527820e-11
## Intercept6 5.527838e-11
## Intercept7 5.527824e-11
kgr_model1_outfit$bri_hyperpar_summary
## mean sd q0.025 q0.5 q0.975 mode
## SD for id2 0.6336751 0.04940331 0.5412722 0.6319559 0.735312 0.6286969
kgr_model1_outfit$exp_effects
## months1 months2 months3 months4 months5 months6 months7
## 8.149968 6.485597 6.629242 5.457713 5.038269 4.451902 4.284773
## months8 months9 months10 months11 months12 Intercept1 Intercept2
## 4.192905 4.023924 4.361771 4.552619 5.900106 6.426855 8.599623
## Intercept3 Intercept4 Intercept5 Intercept6 Intercept7
## 7.953722 72.601888 38.375725 1.978754 149.912968
kgr_model1_outfit$K_time_weight
## [1] 0.5531353
kgr_model1_outfit$gfilter_weight
## [1] 0.4468647
#Show plots
kgr_model1_outfit$K_time_heatmap
## Warning in geom_rect(aes(xmin = x_min, xmax = x_max, ymin = y_min, ymax = y_max), : All aesthetics have length 1, but the data has 3600 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
## a single row.
kgr_model1_outfit$param_plot
kgr_model1_outfit$hyperparam_plot
pp_outsample_plot(pred_data = kgr_model1_outfit$fitted_values)
#Fit kgr_model3
kgr_model3_outfit = kgr_model3(dataset = inla_outsample_data, rho_EPA_rbf = 77.429, rho_EPA_periodic = 5164.252,
rho_time_rbf = 6802.120, rho_time_periodic = 8977.554, sigma2 = 4.997, link=1)
#Extract DIC and WAIC
kgr_model3_DIC = kgr_model3_outfit$model_DIC
kgr_model3_WAIC = kgr_model3_outfit$model_WAIC
#Get summaries of parameter estimates
kgr_model3_outfit$model_summary
## mean sd 0.025quant 0.5quant 0.975quant mode
## months1 2.098281 7.254730 -12.129364 2.098281 16.32593 2.098281
## months2 1.869547 7.254731 -12.358101 1.869547 16.09720 1.869547
## months3 1.891249 7.254731 -12.336399 1.891249 16.11890 1.891249
## months4 1.696320 7.254733 -12.531332 1.696320 15.92397 1.696320
## months5 1.616067 7.254734 -12.611587 1.616067 15.84372 1.616067
## months6 1.491730 7.254735 -12.735926 1.491730 15.71939 1.491730
## months7 1.455167 7.254742 -12.772502 1.455167 15.68284 1.455167
## months8 1.433873 7.254742 -12.793797 1.433873 15.66154 1.433873
## months9 1.392582 7.254743 -12.835089 1.392582 15.62025 1.392582
## months10 1.473476 7.254742 -12.754194 1.473476 15.70114 1.473476
## months11 1.516537 7.254741 -12.711131 1.516537 15.74420 1.516537
## months12 1.775952 7.254738 -12.451709 1.775952 16.00361 1.775952
## Intercept1 1.860420 7.254759 -12.367283 1.860420 16.08812 1.860420
## Intercept2 2.151733 7.254759 -12.075970 2.151733 16.37944 2.151733
## Intercept3 2.074261 7.254782 -12.153486 2.074261 16.30201 2.074261
## Intercept4 4.284676 7.254756 -9.943020 4.284676 18.51237 4.284676
## Intercept5 3.647046 7.254770 -10.580679 3.647046 17.87477 3.647046
## Intercept6 0.682472 7.254877 -13.545463 0.682472 14.91041 0.682472
## Intercept7 5.010173 7.254727 -9.217468 5.010173 19.23781 5.010173
## kld
## months1 5.527847e-11
## months2 5.527831e-11
## months3 5.527831e-11
## months4 5.527828e-11
## months5 5.527840e-11
## months6 5.527824e-11
## months7 5.527828e-11
## months8 5.527841e-11
## months9 5.527826e-11
## months10 5.527829e-11
## months11 5.527843e-11
## months12 5.527848e-11
## Intercept1 5.527829e-11
## Intercept2 5.527830e-11
## Intercept3 5.527836e-11
## Intercept4 5.527838e-11
## Intercept5 5.527828e-11
## Intercept6 5.527838e-11
## Intercept7 5.527840e-11
kgr_model3_outfit$bri_hyperpar_summary
## mean sd q0.025 q0.5 q0.975 mode
## SD for id2 0.6520789 0.05115156 0.5563718 0.6503109 0.7572794 0.6469547
kgr_model3_outfit$exp_effects
## months1 months2 months3 months4 months5 months6 months7
## 8.152142 6.485361 6.627643 5.453841 5.033254 4.444779 4.285201
## months8 months9 months10 months11 months12 Intercept1 Intercept2
## 4.194916 4.025231 4.364377 4.556417 5.905902 6.426436 8.599749
## Intercept3 Intercept4 Intercept5 Intercept6 Intercept7
## 7.958666 72.579055 38.361163 1.978763 149.930726
kgr_model3_outfit$K_weight
## [1] 0.8023821
kgr_model3_outfit$gfilter_weight
## [1] 0.1976179
#Show plots
kgr_model3_outfit$K_time_heatmap
kgr_model3_outfit$K_EPA_heatmap
kgr_model3_outfit$param_plot
kgr_model3_outfit$hyperparam_plot
pp_outsample_plot(pred_data = kgr_model3_outfit$fitted_values)
#Fit kgr_model4
kgr_model4_outfit = kgr_model4(dataset = inla_outsample_data, rho_EPA_rbf = 259.326, rho_EPA_periodic = 246.306,
rho_time_rbf = 2.699, rho_time_periodic = 86.005, sigma2_EPA = 3.215, sigma2_time = 3.583, link = 1)
#Extract DIC and WAIC
kgr_model4_DIC = kgr_model4_outfit$model_DIC
kgr_model4_WAIC = kgr_model4_outfit$model_WAIC
#Get summaries of parameter estimates
kgr_model4_outfit$model_summary %>% kbl() %>% kable_styling()
| mean | sd | 0.025quant | 0.5quant | 0.975quant | mode | kld | |
|---|---|---|---|---|---|---|---|
| months1 | 2.0962575 | 7.254729 | -12.131386 | 2.0962575 | 16.32390 | 2.0962575 | 0 |
| months2 | 1.8678695 | 7.254731 | -12.359778 | 1.8678695 | 16.09552 | 1.8678695 | 0 |
| months3 | 1.8905204 | 7.254731 | -12.337127 | 1.8905204 | 16.11817 | 1.8905204 | 0 |
| months4 | 1.6974496 | 7.254732 | -12.530201 | 1.6974496 | 15.92510 | 1.6974496 | 0 |
| months5 | 1.6173843 | 7.254733 | -12.610268 | 1.6173843 | 15.84504 | 1.6173843 | 0 |
| months6 | 1.4938715 | 7.254735 | -12.733783 | 1.4938715 | 15.72153 | 1.4938715 | 0 |
| months7 | 1.4578850 | 7.254741 | -12.769783 | 1.4578850 | 15.68555 | 1.4578850 | 0 |
| months8 | 1.4355986 | 7.254741 | -12.792070 | 1.4355986 | 15.66327 | 1.4355986 | 0 |
| months9 | 1.3948593 | 7.254742 | -12.832810 | 1.3948593 | 15.62253 | 1.3948593 | 0 |
| months10 | 1.4742767 | 7.254741 | -12.753391 | 1.4742767 | 15.70194 | 1.4742767 | 0 |
| months11 | 1.5158415 | 7.254740 | -12.711825 | 1.5158415 | 15.74351 | 1.5158415 | 0 |
| months12 | 1.7741889 | 7.254737 | -12.453470 | 1.7741889 | 16.00185 | 1.7741889 | 0 |
| Intercept1 | 1.8609237 | 7.254753 | -12.366768 | 1.8609237 | 16.08862 | 1.8609237 | 0 |
| Intercept2 | 2.1518739 | 7.254749 | -12.075810 | 2.1518739 | 16.37956 | 2.1518739 | 0 |
| Intercept3 | 2.0750369 | 7.254763 | -12.152674 | 2.0750369 | 16.30275 | 2.0750369 | 0 |
| Intercept4 | 4.2865170 | 7.254738 | -9.941144 | 4.2865170 | 18.51418 | 4.2865170 | 0 |
| Intercept5 | 3.6470639 | 7.254747 | -10.580615 | 3.6470639 | 17.87474 | 3.6470639 | 0 |
| Intercept6 | 0.6837111 | 7.254852 | -13.544174 | 0.6837111 | 14.91160 | 0.6837111 | 0 |
| Intercept7 | 5.0108764 | 7.254721 | -9.216753 | 5.0108764 | 19.23851 | 5.0108764 | 0 |
kgr_model4_outfit$bri_hyperpar_summary %>% kbl() %>% kable_styling()
| mean | sd | q0.025 | q0.5 | q0.975 | mode | |
|---|---|---|---|---|---|---|
| SD for id2 | 0.5982368 | 0.0461085 | 0.5120096 | 0.5966271 | 0.6931101 | 0.5935781 |
kgr_model4_outfit$exp_effects
## months1 months2 months3 months4 months5 months6 months7
## 8.135665 6.474488 6.622815 5.460004 5.039890 4.454307 4.296862
## months8 months9 months10 months11 months12 Intercept1 Intercept2
## 4.202160 4.034407 4.367875 4.553251 5.895498 6.429673 8.600961
## Intercept3 Intercept4 Intercept5 Intercept6 Intercept7
## 7.964840 72.712769 38.361866 1.981217 150.036175
kgr_model4_outfit$K_weight
## [1] 0.7550725
kgr_model4_outfit$gfilter_weight
## [1] 0.2449275
#Show plots
kgr_model4_outfit$K_time_heatmap
kgr_model4_outfit$K_EPA_heatmap
kgr_model4_outfit$param_plot
kgr_model4_outfit$hyperparam_plot
pp_outsample_plot(pred_data = kgr_model4_outfit$fitted_values)
#Fit kgr_model5
kgr_model5_outfit = kgr_model5(dataset = inla_outsample_data, rho_AR_rbf = 0.016, rho_AR_periodic = 0.019,
rho_DL_rbf = 0.013, rho_DL_periodic = 0.003, rho_int_rbf = 0.002,
rho_int_periodic = 0.011, sigma2_AR = 4.002, sigma2_DL = 0.486, sigma2_int = 3.021, link=1)
#Extract DIC and WAIC
kgr_model5_DIC = kgr_model5_outfit$model_DIC
kgr_model5_WAIC = kgr_model5_outfit$model_WAIC
#Get summaries of parameter estimates
kgr_model5_outfit$model_summary
## mean sd 0.025quant 0.5quant 0.975quant mode
## months1 2.0953297 7.254730 -12.132316 2.0953297 16.32298 2.0953297
## months2 1.8677250 7.254731 -12.359924 1.8677250 16.09537 1.8677250
## months3 1.8913750 7.254731 -12.336273 1.8913750 16.11902 1.8913750
## months4 1.6974139 7.254733 -12.530238 1.6974139 15.92507 1.6974139
## months5 1.6169277 7.254734 -12.610726 1.6169277 15.84458 1.6169277
## months6 1.4929671 7.254735 -12.734689 1.4929671 15.72062 1.4929671
## months7 1.4582942 7.254742 -12.769376 1.4582942 15.68596 1.4582942
## months8 1.4358553 7.254742 -12.791815 1.4358553 15.66353 1.4358553
## months9 1.3937543 7.254743 -12.833917 1.3937543 15.62143 1.3937543
## months10 1.4741209 7.254741 -12.753547 1.4741209 15.70179 1.4741209
## months11 1.5174900 7.254741 -12.710177 1.5174900 15.74516 1.5174900
## months12 1.7748719 7.254738 -12.452789 1.7748719 16.00253 1.7748719
## Intercept1 1.8605094 7.254750 -12.367176 1.8605094 16.08819 1.8605094
## Intercept2 2.1515808 7.254744 -12.076093 2.1515808 16.37925 2.1515808
## Intercept3 2.0712805 7.254752 -12.156408 2.0712805 16.29897 2.0712805
## Intercept4 4.2889445 7.254727 -9.938695 4.2889445 18.51658 4.2889445
## Intercept5 3.6581731 7.254733 -10.569478 3.6581731 17.88582 3.6581731
## Intercept6 0.6781006 7.254837 -13.549755 0.6781006 14.90596 0.6781006
## Intercept7 5.0075362 7.254718 -9.220087 5.0075362 19.23516 5.0075362
## kld
## months1 5.527833e-11
## months2 5.527844e-11
## months3 5.527831e-11
## months4 5.527855e-11
## months5 5.527841e-11
## months6 5.527838e-11
## months7 5.527828e-11
## months8 5.527841e-11
## months9 5.527840e-11
## months10 5.527842e-11
## months11 5.527843e-11
## months12 5.527834e-11
## Intercept1 5.527843e-11
## Intercept2 5.527839e-11
## Intercept3 5.527840e-11
## Intercept4 5.527851e-11
## Intercept5 5.527841e-11
## Intercept6 5.527818e-11
## Intercept7 5.527851e-11
kgr_model5_outfit$bri_hyperpar_summary
## mean sd q0.025 q0.5 q0.975 mode
## SD for id2 0.5871699 0.04499298 0.5029716 0.5856181 0.6796973 0.5826719
kgr_model5_outfit$exp_effects
## months1 months2 months3 months4 months5 months6 months7
## 8.128120 6.473553 6.628477 5.459809 5.037590 4.450280 4.298620
## months8 months9 months10 months11 months12 Intercept1 Intercept2
## 4.203238 4.029951 4.367195 4.560763 5.899525 6.427010 8.598440
## Intercept3 Intercept4 Intercept5 Intercept6 Intercept7
## 7.934977 72.889492 38.790413 1.970132 149.535860
kgr_model5_outfit$K_weight
## [1] 0.6824774
kgr_model5_outfit$gfilter_weight
## [1] 0.3175226
#Show plots
kgr_model5_outfit$K_AR_heatmap
## Warning in geom_rect(aes(xmin = x_min, xmax = x_max, ymin = y_min, ymax = y_max), : All aesthetics have length 1, but the data has 3600 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
## a single row.
kgr_model5_outfit$K_DL_heatmap
## Warning in geom_rect(aes(xmin = x_min, xmax = x_max, ymin = y_min, ymax = y_max), : All aesthetics have length 1, but the data has 3600 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
## a single row.
kgr_model5_outfit$K_Interaction_heatmap
## Warning in geom_rect(aes(xmin = x_min, xmax = x_max, ymin = y_min, ymax = y_max), : All aesthetics have length 1, but the data has 3600 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
## a single row.
kgr_model5_outfit$prec_heatmap
## Warning in geom_rect(aes(xmin = x_min, xmax = x_max, ymin = y_min, ymax = y_max), : All aesthetics have length 1, but the data has 176400 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
## a single row.
kgr_model5_outfit$param_plot
kgr_model5_outfit$hyperparam_plot
pp_outsample_plot(pred_data = kgr_model5_outfit$fitted_values)
print(degree_connectivity)
## c(1:num_clus) node_connections
## 1 1 5
## 2 2 5
## 3 3 4
## 4 4 4
## 5 5 3
## 6 6 4
## 7 7 5
To compare out of sample fit, we calculate a variety of metrics. The first three, MAE, MASE, and MAPE specifically evaluate forecast accuracy i.e. the prediction accuracy for the time points t=55,…,60. The last one is RMSE which is calculated based on the entire sample to get an idea of wholistic fit. Note that once again, we scale the RMSEs so that they can be interpreted relative to the average number of respiratory related deaths in that cluster. Lastly, we also calculate a Frequentist coverage rate based on the credible intervals produced by INLA to get an idea of our models’ uncertainty quantification.
Mean absolute error (MAE) is a measure of the average size of the mistakes in a collection of predictions, without taking their direction into account
MAE = \(\frac{1}{h_{max}} \sum_{h=1}^{h_{max}} | \hat \lambda_{t+h}^{obs} - \hat \lambda_{t+h}|\) where \(\hat \lambda_{t+h}^{obs}\) is the average of the number of deaths observed for month \(t+h\) over all years
MAE_table = matrix(nrow=7,ncol=num_clus)
for (i in 1:num_clus){
actual = inla_full_data %>% filter(id == i) %>% data.frame()
pm_1 = ref_model2_outfit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
pm_2 = ref_model3_outfvs %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
pm_3 = kgr_model1_outfit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
pm_4 = kgr_model2_outfit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
pm_5 = kgr_model3_outfit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
pm_6 = kgr_model4_outfit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
pm_7 = kgr_model5_outfit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
actual_test = c()
for (j in 7:12){
actual_j = actual %>% filter(months == j) %>% select(response)
est_lambda = mean(actual_j$response)
actual_test = c(actual_test,as.numeric(est_lambda))
}
pm_1_test = pm_1[55:60,]
pm_2_test = pm_2[55:60,]
pm_3_test = pm_3[55:60,]
pm_4_test = pm_4[55:60,]
pm_5_test = pm_5[55:60,]
pm_6_test = pm_6[55:60,]
pm_7_test = pm_7[55:60,]
actual_test_mean = mean(actual_test)
mae1 = mean(abs(actual_test - pm_1_test$mean))
mae2 = mean(abs(actual_test - pm_2_test$mean))
mae3 = mean(abs(actual_test - pm_3_test$mean))
mae4 = mean(abs(actual_test - pm_4_test$mean))
mae5 = mean(abs(actual_test - pm_5_test$mean))
mae6 = mean(abs(actual_test - pm_6_test$mean))
mae7 = mean(abs(actual_test - pm_7_test$mean))
MAE_table[,i] = c(mae1,mae2,mae3,mae4,mae5,mae6,mae7)
# MAE_table[,i] = MAE_table[,i] / actual_test_mean
}
#Table 2: MAE on test dataset
MAE_table = data.frame(MAE_table)
colnames(MAE_table) = c("Cluster 1","Cluster 2","Cluster 3","Cluster 4",
"Cluster 5","Cluster 6","Cluster 7")
rownames(MAE_table) = c("BYM model","LGCP model","Proposed KGR model 1",
"Proposed KGR model 2","Proposed KGR model 3",
"Proposed KGR model 4","Proposed KGR model 5")
MAE_table
## Cluster 1 Cluster 2 Cluster 3 Cluster 4 Cluster 5
## BYM model 1.640119 2.592605 1.409833 6.274058 5.752867
## LGCP model 2.994018 7.089841 2.878143 34.692634 27.068046
## Proposed KGR model 1 1.617895 2.404291 1.180011 8.389777 4.715321
## Proposed KGR model 2 1.625073 2.431390 1.272311 6.250855 5.024945
## Proposed KGR model 3 1.619437 2.416769 1.289856 6.768091 5.070095
## Proposed KGR model 4 1.619323 2.404170 1.244324 6.580567 5.206202
## Proposed KGR model 5 1.610360 2.413207 1.339209 6.148253 5.088513
## Cluster 6 Cluster 7
## BYM model 1.268559 10.26971
## LGCP model 1.158544 44.45042
## Proposed KGR model 1 1.278390 12.21312
## Proposed KGR model 2 1.270790 10.86408
## Proposed KGR model 3 1.275579 11.00173
## Proposed KGR model 4 1.259610 10.19292
## Proposed KGR model 5 1.244918 10.31392
Mean absolute scaled error (MASE) is a measure of the accuracy of forecasts. It is the mean absolute error of the forecast values, divided by the mean absolute error of the in-sample one-step naive forecast.
MASE = \(\frac{\frac{1}{n} \sum_{t=1}^n |\hat \lambda_{t+h}^{obs} - \hat \lambda_{t+h}|}{\frac{1}{n-1} \sum_{t=2}^n |\hat \lambda_{t+h}^{obs} - \hat \lambda_{t+h-1}^{obs}|}\)
MASE_table = matrix(nrow=7,ncol=num_clus)
for (i in 1:num_clus){
actual = inla_full_data %>% filter(id == i) %>% data.frame()
pm_1 = ref_model2_outfit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
pm_2 = ref_model3_outfvs %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
pm_3 = kgr_model1_outfit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
pm_4 = kgr_model2_outfit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
pm_5 = kgr_model3_outfit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
pm_6 = kgr_model4_outfit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
pm_7 = kgr_model5_outfit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
actual_test = c()
for (j in 6:12){
actual_j = actual %>% filter(months == j) %>% select(response)
est_lambda = mean(actual_j$response)
actual_test = c(actual_test,as.numeric(est_lambda))
}
pm_1_test = pm_1$mean[54:60]
pm_2_test = pm_2$mean[54:60]
pm_3_test = pm_3$mean[54:60]
pm_4_test = pm_4$mean[54:60]
pm_5_test = pm_5$mean[54:60]
pm_6_test = pm_6$mean[54:60]
pm_7_test = pm_7$mean[54:60]
actual_test_mean = mean(actual_test)
values1 = c()
values2 = c()
values3 = c()
values4 = c()
values5 = c()
values6 = c()
values7 = c()
for (j in 2:length(actual_test)){
error1 = (abs(actual_test[j] - pm_1_test[j])) / (abs(actual_test[j] - actual_test[j-1])) / (length(actual_test)-1)
values1 = c(values1,error1)
error2 = (abs(actual_test[j] - pm_2_test[j])) / (abs(actual_test[j] - actual_test[j-1])) / (length(actual_test)-1)
values2 = c(values2,error2)
error3 = (abs(actual_test[j] - pm_3_test[j])) / (abs(actual_test[j] - actual_test[j-1])) / (length(actual_test)-1)
values3 = c(values3,error3)
error4 = (abs(actual_test[j] - pm_4_test[j])) / (abs(actual_test[j] - actual_test[j-1])) / (length(actual_test)-1)
values4 = c(values4,error4)
error5 = (abs(actual_test[j] - pm_5_test[j])) / (abs(actual_test[j] - actual_test[j-1])) / (length(actual_test)-1)
values5 = c(values5,error5)
error6 = (abs(actual_test[j] - pm_6_test[j])) / (abs(actual_test[j] - actual_test[j-1])) / (length(actual_test)-1)
values6 = c(values6,error6)
error7 = (abs(actual_test[j] - pm_7_test[j])) / (abs(actual_test[j] - actual_test[j-1])) / (length(actual_test)-1)
values7 = c(values7,error7)
}
mase1 = mean(values1)
mase2 = mean(values2)
mase3 = mean(values3)
mase4 = mean(values4)
mase5 = mean(values5)
mase6 = mean(values6)
mase7 = mean(values7)
MASE_table[,i] = c(mase1,mase2,mase3,mase4,mase5,mase6,mase7)
# MASE_table[,i] = MASE_table[,i] / actual_test_mean
}
#Table 3: MASE on test dataset
MASE_table = data.frame(MASE_table)
colnames(MASE_table) = c("Cluster 1","Cluster 2","Cluster 3","Cluster 4",
"Cluster 5","Cluster 6","Cluster 7")
rownames(MASE_table) = c("BYM model","LGCP model","Proposed KGR model 1",
"Proposed KGR model 2","Proposed KGR model 3",
"Proposed KGR model 4","Proposed KGR model 5")
MASE_table
## Cluster 1 Cluster 2 Cluster 3 Cluster 4 Cluster 5
## BYM model 0.4052304 0.1108288 0.10068953 0.1150280 0.3876796
## LGCP model 0.8629428 0.3049761 0.22548712 0.6024962 1.6516640
## Proposed KGR model 1 0.3632247 0.1016876 0.07567590 0.1496485 0.2779578
## Proposed KGR model 2 0.3834293 0.1036208 0.08972528 0.1170235 0.3450341
## Proposed KGR model 3 0.3785013 0.1028707 0.09289762 0.1211348 0.3378748
## Proposed KGR model 4 0.3901229 0.1023405 0.08728858 0.1156821 0.3491299
## Proposed KGR model 5 0.3852788 0.1030031 0.10171808 0.1126221 0.3611519
## Cluster 6 Cluster 7
## BYM model 0.1186370 0.05246645
## LGCP model 0.1132864 0.33929012
## Proposed KGR model 1 0.1205859 0.09702245
## Proposed KGR model 2 0.1194226 0.07133334
## Proposed KGR model 3 0.1196394 0.07412257
## Proposed KGR model 4 0.1177338 0.06185829
## Proposed KGR model 5 0.1166126 0.06461950
Mean absolute percentage error (MAPE), also known as mean absolute percentage deviation (MAPD), is a measure of prediction accuracy of a forecasting method in statistics, expressing accuracy as a ratio
MAPE = \(\frac{100}{n} \sum_{t=1}^n |\frac{\hat \lambda_{t+h}^{obs} - \hat \lambda_{t+h}}{\hat \lambda_{t+h}^{obs}}|\)
MAPE_table = matrix(nrow=7,ncol=num_clus)
for (i in 1:num_clus){
actual = inla_full_data %>% filter(id == i) %>% data.frame()
pm_1 = ref_model2_outfit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
pm_2 = ref_model3_outfvs %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
pm_3 = kgr_model1_outfit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
pm_4 = kgr_model2_outfit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
pm_5 = kgr_model3_outfit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
pm_6 = kgr_model4_outfit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
pm_7 = kgr_model5_outfit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
actual_test = c()
for (j in 7:12){
actual_j = actual %>% filter(months == j) %>% select(response)
est_lambda = mean(actual_j$response)
actual_test = c(actual_test,as.numeric(est_lambda))
}
pm_1_test = pm_1[55:60,]
pm_2_test = pm_2[55:60,]
pm_3_test = pm_3[55:60,]
pm_4_test = pm_4[55:60,]
pm_5_test = pm_5[55:60,]
pm_6_test = pm_6[55:60,]
pm_7_test = pm_7[55:60,]
actual_test_mean = mean(actual_test)
mape1 = mean(abs((actual_test - pm_1_test$mean)/actual_test))
mape2 = mean(abs((actual_test - pm_2_test$mean)/actual_test))
mape3 = mean(abs((actual_test - pm_3_test$mean)/actual_test))
mape4 = mean(abs((actual_test - pm_4_test$mean)/actual_test))
mape5 = mean(abs((actual_test - pm_5_test$mean)/actual_test))
mape6 = mean(abs((actual_test - pm_6_test$mean)/actual_test))
mape7 = mean(abs((actual_test - pm_7_test$mean)/actual_test))
MAPE_table[,i] = c(mape1,mape2,mape3,mape4,mape5,mape6,mape7)
# MAPE_table[,i] = MAPE_table[,i] / actual_test_mean
}
#Table 2: RMSE on test dataset
MAPE_table = data.frame(MAPE_table)
colnames(MAPE_table) = c("Cluster 1","Cluster 2","Cluster 3","Cluster 4",
"Cluster 5","Cluster 6","Cluster 7")
rownames(MAPE_table) = c("BYM model","LGCP model","Proposed KGR model 1",
"Proposed KGR model 2","Proposed KGR model 3",
"Proposed KGR model 4","Proposed KGR model 5")
MAPE_table
## Cluster 1 Cluster 2 Cluster 3 Cluster 4 Cluster 5
## BYM model 0.06162320 0.06821119 0.03827465 0.01898377 0.03650010
## LGCP model 0.11321922 0.20205938 0.08043295 0.10539589 0.16558166
## Proposed KGR model 1 0.06062526 0.06279821 0.03173739 0.02413544 0.02855441
## Proposed KGR model 2 0.06105853 0.06371949 0.03442245 0.01844368 0.03145997
## Proposed KGR model 3 0.06085294 0.06331918 0.03494200 0.01974325 0.03140239
## Proposed KGR model 4 0.06109569 0.06309110 0.03372950 0.01934645 0.03216147
## Proposed KGR model 5 0.06072717 0.06338153 0.03657609 0.01814021 0.03179558
## Cluster 6 Cluster 7
## BYM model 0.1315402 0.01424397
## LGCP model 0.1390704 0.06520402
## Proposed KGR model 1 0.1329921 0.01774066
## Proposed KGR model 2 0.1322369 0.01539452
## Proposed KGR model 3 0.1327686 0.01564038
## Proposed KGR model 4 0.1311103 0.01447426
## Proposed KGR model 5 0.1294795 0.01470658
#Overall fit
test_RMSE_table = matrix(nrow=7,ncol=num_clus)
for (i in 1:num_clus){
actual = inla_full_data %>% filter(id == i) %>% data.frame()
pm_1 = ref_model2_outfit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
pm_2 = ref_model3_outfvs %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
pm_3 = kgr_model1_outfit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
pm_4 = kgr_model2_outfit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
pm_5 = kgr_model3_outfit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
pm_6 = kgr_model4_outfit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
pm_7 = kgr_model5_outfit$fitted_values %>% filter(id == i) %>% select(mean,sd) %>% data.frame()
actual_test = c()
for (j in 7:12){
actual_j = actual %>% filter(months == j) %>% select(response)
est_lambda = mean(actual_j$response)
actual_test = c(actual_test,as.numeric(est_lambda))
}
pm_1_test = pm_1[55:60,]
pm_2_test = pm_2[55:60,]
pm_3_test = pm_3[55:60,]
pm_4_test = pm_4[55:60,]
pm_5_test = pm_5[55:60,]
pm_6_test = pm_6[55:60,]
pm_7_test = pm_7[55:60,]
actual_test_mean = mean(actual_test)
test_rmse1 = sqrt(mean((actual_test - pm_1_test$mean)^2))
test_rmse2 = sqrt(mean((actual_test - pm_2_test$mean)^2))
test_rmse3 = sqrt(mean((actual_test - pm_3_test$mean)^2))
test_rmse4 = sqrt(mean((actual_test - pm_4_test$mean)^2))
test_rmse5 = sqrt(mean((actual_test - pm_5_test$mean)^2))
test_rmse6 = sqrt(mean((actual_test - pm_6_test$mean)^2))
test_rmse7 = sqrt(mean((actual_test - pm_7_test$mean)^2))
test_RMSE_table[,i] = c(test_rmse1,test_rmse2,test_rmse3,test_rmse4,
test_rmse5,test_rmse6,test_rmse7)
test_RMSE_table[,i] = test_RMSE_table[,i] / actual_test_mean
}
#Table 2: RMSE on test dataset
test_RMSE_table = data.frame(test_RMSE_table)
colnames(test_RMSE_table) = c("Cluster 1","Cluster 2","Cluster 3","Cluster 4",
"Cluster 5","Cluster 6","Cluster 7")
rownames(test_RMSE_table) = c("BYM model","LGCP model","Proposed KGR model 1",
"Proposed KGR model 2","Proposed KGR model 3",
"Proposed KGR model 4","Proposed KGR model 5")
test_RMSE_table
## Cluster 1 Cluster 2 Cluster 3 Cluster 4 Cluster 5
## BYM model 0.06721318 0.07719890 0.04377831 0.02141523 0.04257107
## LGCP model 0.11862336 0.21281331 0.09243843 0.11237348 0.16447234
## Proposed KGR model 1 0.06405293 0.06944938 0.04208873 0.02917645 0.03134016
## Proposed KGR model 2 0.06632890 0.07268991 0.04091453 0.02132966 0.03579614
## Proposed KGR model 3 0.06573949 0.07173522 0.04037663 0.02280454 0.03542109
## Proposed KGR model 4 0.06673335 0.07171148 0.04046855 0.02261775 0.03608477
## Proposed KGR model 5 0.06650656 0.07262431 0.04194410 0.02120479 0.03602822
## Cluster 6 Cluster 7
## BYM model 0.1524907 0.01757785
## LGCP model 0.1687878 0.06722695
## Proposed KGR model 1 0.1520056 0.01922838
## Proposed KGR model 2 0.1511964 0.01803872
## Proposed KGR model 3 0.1515460 0.01809712
## Proposed KGR model 4 0.1502002 0.01687901
## Proposed KGR model 5 0.1490934 0.01658505
coverage = rep(0,7)
true_values = inla_full_data$response
models_fvs = list(ref_model2_outfit$fitted_values,ref_model3_outfvs,kgr_model1_outfit$fitted_values,
kgr_model2_outfit$fitted_values,kgr_model3_outfit$fitted_values,kgr_model4_outfit$fitted_values,kgr_model5_outfit$fitted_values)
for (i in 1:7){
lci = models_fvs[[i]] %>% select('0.025quant')
uci = models_fvs[[i]] %>% select('0.975quant')
captured = (true_values >= lci$'0.025quant' & true_values <= uci$'0.975quant')
coverage[i] = sum(captured)/length(captured)
}
coverage = data.frame(coverage)
colnames(coverage) = "95% coverage"
rownames(coverage) = c("BYM model","LGCP model","Proposed KGR model 1",
"Proposed KGR model 2","Proposed KGR model 3","Proposed KGR model 4","Proposed KGR model 5")
coverage
## 95% coverage
## BYM model 0.3166667
## LGCP model 0.1214286
## Proposed KGR model 1 0.7857143
## Proposed KGR model 2 0.7857143
## Proposed KGR model 3 0.7833333
## Proposed KGR model 4 0.7904762
## Proposed KGR model 5 0.7904762
#Plot heatmap for time = 60
clusterlabels$counties = tolower(clusterlabels$counties)
colnames(clusterlabels) = c("subregion","cluster")
merged_response = join(ca_map,clusterlabels,by = "subregion")
true_values = inla_full_data %>% filter(time == 60) %>% select(id,response)
colnames(true_values) = c("cluster","response")
merged_response = join(merged_response,true_values,by = "cluster")
heatmap_limits = c(0,1000)
legend_titles = c("Ref model 2 fitted values","Ref model 3 fitted values","Prop model 1 fitted values",
"Prop model 2 fitted values","Prop model 3 fitted values","Prop model 4 fitted values")
#Plot of observed mortality
gg_pop <- ggplot() +
geom_polygon(data = merged_response, aes(x = long, y = lat, group = group, fill = response),
color = "black") +
coord_fixed(ratio = 1.3, xlim = c(-125, -112), ylim = c(30, 42)) +
scale_fill_viridis_c(limits = heatmap_limits, name = "Observed mortality") +
theme_void() +
labs(title = "Dec 2019",
x = "Longitude",
y = "Latitude")
print(gg_pop)
models_intensity_fvs = list(ref_model1_outfit$fitted_values,ref_model2_outfit$fitted_values,ref_model3_outfvs,
kgr_model1_outfit$fitted_values,kgr_model2_outfit$fitted_values,kgr_model3_outfit$fitted_values,
kgr_model4_outfit$fitted_values,kgr_model5_outfit$fitted_values)
merged_response = join(ca_map,clusterlabels,by = "subregion")
legend_titles = c("Ref model 1 fitted values","Ref model 2 fitted values","Ref model 3 fitted values",
"Prop model 1 fitted values","Prop model 2 fitted values","Prop model 3 fitted values",
"Prop model 4 fitted values","Prop model 5 fitted values")
for (i in 1:8){
fitted_values = models_intensity_fvs[[i]] %>% filter(time == 60) %>% select(id,mean)
heatmap_limits = c(0,1.5*max(fitted_values$mean))
colname = sprintf("prediction.%s",i)
colnames(fitted_values) = c("cluster",colname)
merged_response = join(merged_response,fitted_values,by = "cluster")
#Heatmap of each model's fvs
gg_pop <- ggplot() +
geom_polygon(data = merged_response, aes(x = long, y = lat, group = group, fill = merged_response[,i+7]),
color = "black") +
coord_fixed(ratio = 1.3, xlim = c(-125, -112), ylim = c(30, 42)) +
scale_fill_viridis_c(limits = heatmap_limits, name = legend_titles[i]) +
theme_void() +
labs(title = "Dec 2019",
x = "Longitude",
y = "Latitude")
print(gg_pop)
}
merged_response = join(ca_map,clusterlabels,by = "subregion")
legend_titles = c("Ref model 1 variance","Ref model 2 variance","Ref model 3 variance",
"Prop model 1 variance","Prop model 2 variance","Prop model 3 variance",
"Prop model 4 variance","Prop model 5 variance")
for (j in 1:8){
fitted_values = models_intensity_fvs[[j]] %>% filter(time == 60) %>% select(id,sd)
fitted_values$sd = fitted_values$sd^2
if (j == 3){
heatmap_limits = c(0,20000)
} else{
heatmap_limits = c(0,6000)
}
# heatmap_limits = c(0,8000)
colname = sprintf("prediction.%s",j)
colnames(fitted_values) = c("cluster",colname)
merged_response = join(merged_response,fitted_values,by = "cluster")
#Heatmap of each model's fvs
gg_pop <- ggplot() +
geom_polygon(data = merged_response, aes(x = long, y = lat, group = group, fill = merged_response[,j+7]),
color = "black") +
coord_fixed(ratio = 1.3, xlim = c(-125, -112), ylim = c(30, 42)) +
scale_fill_viridis_c(limits = heatmap_limits, name = legend_titles[j]) +
theme_void() +
labs(title = "Dec 2019",
x = "Longitude",
y = "Latitude")
print(gg_pop)
}
In this section, I implemented a forecasting exercise in which I start with 36 months of training data. I will use that data to estimate a model and then forecast one month ahead. Using our starting data AND the forecasted values, I will re-estimate the model and forecast again to get the next month’s predictions. This process continues until the original 36 months of data have been used to produce a complete time series of 60 months (the last 3 years are forecasted). This exercise gives us another way to compare the predictive ability of our various models.
ref_model_error = cbind(MAE,MASE,MAPE,RMSPE)
ref_model_error
## MAE MASE MAPE RMSPE
## [1,] 18.228571 0.020729671 0.08875661 25.497563
## [2,] 7.600000 0.018738550 0.04932430 12.802678
## [3,] 3.000000 0.026709075 0.04295158 3.796991
## [4,] 3.400000 0.012414181 0.02372278 6.187083
## [5,] 4.171429 0.066365751 0.03513482 5.475139
## [6,] 2.571429 0.039829518 0.04001582 3.140519
## [7,] 4.971429 0.084333436 0.04883699 6.975058
## [8,] 3.628571 0.049233209 0.06536543 4.872664
## [9,] 5.828571 0.092609032 0.08009851 8.024961
## [10,] 1.714286 0.042161765 0.03469257 2.140761
## [11,] 4.657143 0.080198043 0.06214016 5.826540
## [12,] 9.371429 0.004038253 0.05799603 17.473245
## [13,] 18.228571 0.013031877 0.08875661 25.497563
## [14,] 7.600000 0.015874065 0.04932430 12.802678
## [15,] 3.000000 0.027994560 0.04295158 3.796991
## [16,] 3.400000 0.012598345 0.02372278 6.187083
## [17,] 4.171429 0.066006542 0.03513482 5.475139
## [18,] 2.571429 0.038961132 0.04001582 3.140519
## [19,] 4.971429 0.083501151 0.04883699 6.975058
## [20,] 3.628571 0.047773676 0.06536543 4.872664
## [21,] 5.828571 0.091153982 0.08009851 8.024961
## [22,] 1.714286 0.041326319 0.03469257 2.140761
## [23,] 4.800000 0.079698886 0.06680869 5.853448
## [24,] 9.371429 0.003106388 0.05799603 17.473245
#Plot of posterior predictive estimates (months 37-60) with credible interval bands OVERLAID on response
true_mortality = inla_full_data
for (i in 1:num_clus){
df = true_mortality %>% filter(id == i) %>% select(response)
preds = starting_data %>% filter(id == i)
colnames(preds)[3] = "mean"
df = cbind(df,preds)
title = sprintf("Cluster %s",i)
post_pred_plot = df %>% ggplot(aes(x=time,y=response)) + geom_point() +
geom_line(aes(y=mean),color = "red") + geom_vline(xintercept = 36,linetype = "dashed",color = "blue",linewidth = 1.5) + ggtitle(title)
print(post_pred_plot)
}
prop_model_error = cbind(MAE2,MASE2,MAPE2,RMSPE2)
prop_model_error
## MAE2 MASE2 MAPE2 RMSPE2
## [1,] 18.228571 0.020729671 0.08875661 25.497563
## [2,] 7.600000 0.018738550 0.04932430 12.802678
## [3,] 3.000000 0.026709075 0.04295158 3.796991
## [4,] 3.400000 0.012414181 0.02372278 6.187083
## [5,] 4.171429 0.066365751 0.03513482 5.475139
## [6,] 2.571429 0.039829518 0.04001582 3.140519
## [7,] 4.971429 0.084333436 0.04883699 6.975058
## [8,] 3.628571 0.049233209 0.06536543 4.872664
## [9,] 5.828571 0.092609032 0.08009851 8.024961
## [10,] 1.714286 0.042161765 0.03469257 2.140761
## [11,] 4.657143 0.080198043 0.06214016 5.826540
## [12,] 9.371429 0.004038253 0.05799603 17.473245
## [13,] 18.228571 0.013031877 0.08875661 25.497563
## [14,] 7.600000 0.015874065 0.04932430 12.802678
## [15,] 3.000000 0.027994560 0.04295158 3.796991
## [16,] 3.400000 0.012598345 0.02372278 6.187083
## [17,] 4.171429 0.066006542 0.03513482 5.475139
## [18,] 2.571429 0.038961132 0.04001582 3.140519
## [19,] 4.971429 0.083501151 0.04883699 6.975058
## [20,] 3.628571 0.047773676 0.06536543 4.872664
## [21,] 5.828571 0.091153982 0.08009851 8.024961
## [22,] 1.714286 0.041326319 0.03469257 2.140761
## [23,] 4.800000 0.079698886 0.06680869 5.853448
## [24,] 9.371429 0.003106388 0.05799603 17.473245
#Plot of posterior predictive estimates (months 37-60) with credible interval bands OVERLAID on response
true_mortality = inla_full_data
for (i in 1:num_clus){
df = true_mortality %>% filter(id == i) %>% select(response)
preds = starting_data %>% filter(id == i)
colnames(preds)[3] = "mean"
df = cbind(df,preds)
title = sprintf("Cluster %s",i)
post_pred_plot = df %>% ggplot(aes(x=time,y=response)) + geom_point() +
geom_line(aes(y=mean),color = "red") + geom_vline(xintercept = 36,linetype = "dashed",color = "blue",linewidth = 1.5) + ggtitle(title)
print(post_pred_plot)
}
Grouped barplot for each error metric over 24 time points
# Create data frames for both tables
data1 <- data.frame(
Time = 37:60,MAE,MASE,MAPE,RMSPE
)
data2 <- data.frame(
Time = 37:60,MAE2,MASE2,MAPE2,RMSPE2
)
colnames(data2) = colnames(data1)
# Add a column to each data frame to indicate the source table
data1$Source <- 'Ref model'
data2$Source <- 'Prop model'
# Combine the two data frames
combined_data <- rbind(data1, data2)
# Melt the combined data to long format
data_long <- melt(combined_data, id.vars = c("Time", "Source"), variable.name = "Metric", value.name = "Value")
# Plot grouped bar charts using ggplot2
ggplot(data_long, aes(x = factor(Time), y = Value, fill = Source)) +
geom_bar(stat = "identity", position = position_dodge(width = 0.9)) +
facet_wrap(~ Metric, scales = "free_y") +
labs(x = "Time Points", y = "Error Values", title = "") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
In this section, I implemented a forecasting exercise in which I start with 36 months of training data. I will use that data to estimate a model and then forecast 6 months ahead. Now, we slide the time window of interest and use months 7-36 AND the newly forecasted values to re-estimate the model and forecast again to get the next 6 months. This process continues until the original 36 months of data have been used to produce a complete time series of 60 months (the last 3 years are forecasted). This exercise gives us another way to compare the predictive ability of our various models.
starting_data = inla_full_data[1:252,]
starting_data$months = as.numeric(starting_data$months)
rownames(starting_data) = NULL
while(max(starting_data$time) < 60){
###Attach df for next 6 months with NAs in response
end = nrow(starting_data)
id = rep(1:7,6)
id2 = (starting_data$id2[end]+1):(starting_data$id2[end]+42)
response = rep(NA,42)
time = rep((starting_data$time[end]+1):(starting_data$time[end]+6),each=7)
Intercept1 = rep(c(1,NA,NA,NA,NA,NA,NA),6)
Intercept2 = rep(c(NA,1,NA,NA,NA,NA,NA),6)
Intercept3 = rep(c(NA,NA,1,NA,NA,NA,NA),6)
Intercept4 = rep(c(NA,NA,NA,1,NA,NA,NA),6)
Intercept5 = rep(c(NA,NA,NA,NA,1,NA,NA),6)
Intercept6 = rep(c(NA,NA,NA,NA,NA,1,NA),6)
Intercept7 = rep(c(NA,NA,NA,NA,NA,NA,1),6)
if (starting_data$months[end] == 6){
months = rep(c(7,8,9,10,11,12),each=7)
} else if (starting_data$months[end] == 12){
months = rep(c(1,2,3,4,5,6),each=7)
}
new_data = data.frame(id,id2,response,time,months,Intercept1,Intercept2,Intercept3,
Intercept4,Intercept5,Intercept6,Intercept7)
starting_data = rbind(starting_data,new_data)
starting_data$months = factor(starting_data$months)
###Fit KGR model on most recent 36 months
starting_data_subset = starting_data[(nrow(starting_data)-293):end,]
ref_formula2 = response ~ -1 + months + Intercept1 + Intercept2 + Intercept3 + Intercept4 + Intercept5 + Intercept6 +
Intercept7 + f(id, model = "bym", graph = huge.est)
ref_model2 = inla(ref_formula2,family = "poisson",data = starting_data_subset,
control.compute = list(dic=TRUE,waic=TRUE),
control.predictor = list(compute = TRUE, link = 1))
###Append ref model 2 predictions to starting data
preds_ref_model2 = ref_model2$summary.fitted.values
preds_ref_model2$mean = round(preds_ref_model2$mean)
end2 = nrow(preds_ref_model2)
pred_data = preds_ref_model2$mean[(end2-41):end2]
starting_data$response[(end+1):(end+42)] = pred_data
starting_data$months = as.numeric(starting_data$months)
}
#Plot of posterior predictive estimates (months 37-60) with credible interval bands OVERLAID on response
true_mortality = inla_full_data
for (i in 1:num_clus){
df = true_mortality %>% filter(id == i) %>% select(response)
preds = starting_data %>% filter(id == i)
colnames(preds)[3] = "mean"
df = cbind(df,preds)
post_pred_plot = df %>% ggplot(aes(x=time,y=response)) + geom_point() +
geom_line(aes(y=mean),color = "red") + geom_vline(xintercept = 36,linetype = "dashed",color = "blue",linewidth = 1.5) + ggtitle(sprintf("Sliding Timeframe Forecast for Cluster %1.0f",i))
print(post_pred_plot)
}
inv_covGP = kgr_model2_outfit$prec
inv_covGP3 = kgr_model3_outfit$prec
inv_covGP4 = kgr_model4_outfit$prec
inv_covGP5 = kgr_model5_outfit$prec
starting_data = inla_full_data[1:252,]
starting_data$months = as.numeric(starting_data$months)
rownames(starting_data) = NULL
while(max(starting_data$time) < 60){
###Attach df for next 6 months with NAs in response
end = nrow(starting_data)
id = rep(1:7,6)
id2 = (starting_data$id2[end]+1):(starting_data$id2[end]+42)
response = rep(NA,42)
time = rep((starting_data$time[end]+1):(starting_data$time[end]+6),each=7)
Intercept1 = rep(c(1,NA,NA,NA,NA,NA,NA),6)
Intercept2 = rep(c(NA,1,NA,NA,NA,NA,NA),6)
Intercept3 = rep(c(NA,NA,1,NA,NA,NA,NA),6)
Intercept4 = rep(c(NA,NA,NA,1,NA,NA,NA),6)
Intercept5 = rep(c(NA,NA,NA,NA,1,NA,NA),6)
Intercept6 = rep(c(NA,NA,NA,NA,NA,1,NA),6)
Intercept7 = rep(c(NA,NA,NA,NA,NA,NA,1),6)
if (starting_data$months[end] == 6){
months = rep(c(7,8,9,10,11,12),each=7)
} else if (starting_data$months[end] == 12){
months = rep(c(1,2,3,4,5,6),each=7)
}
new_data = data.frame(id,id2,response,time,months,Intercept1,Intercept2,Intercept3,
Intercept4,Intercept5,Intercept6,Intercept7)
starting_data = rbind(starting_data,new_data)
starting_data$months = factor(starting_data$months)
###Fit KGR model on most recent 36 months
starting_data_subset = starting_data[(nrow(starting_data)-293):end,]
# #Proposed model 2
# kgr_formula2 = response ~ -1 + months + Intercept1 + Intercept2 + Intercept3 + Intercept4 + Intercept5 + Intercept6 + Intercept7 + f(id,model = "generic0",Cmatrix = inv_covGP[c(starting_data_subset$id2),c(starting_data_subset$id2)])
#
# kgr_model2 = inla(kgr_formula2, data = starting_data_subset, family = "poisson",
# control.predictor = list(compute = TRUE, link = 1))
#
# preds_kgr_model = kgr_model2$summary.fitted.values
# #Proposed model 3
# kgr_formula3 = response ~ -1 + months + Intercept1 + Intercept2 + Intercept3 + Intercept4 + Intercept5 + Intercept6 + Intercept7 + f(id,model = "generic0",Cmatrix = inv_covGP3[c(starting_data_subset$id2),c(starting_data_subset$id2)])
#
# kgr_model3 = inla(kgr_formula3, data = starting_data_subset, family = "poisson",
# control.predictor = list(compute = TRUE, link = 1))
#
# preds_kgr_model = kgr_model3$summary.fitted.values
# #Proposed model 4
# kgr_formula4 = response ~ -1 + months + Intercept1 + Intercept2 + Intercept3 + Intercept4 + Intercept5 + Intercept6 + Intercept7 + f(id,model = "generic0",Cmatrix = inv_covGP4[c(starting_data_subset$id2),c(starting_data_subset$id2)])
#
# kgr_model4 = inla(kgr_formula4, data = starting_data_subset, family = "poisson",
# control.predictor = list(compute = TRUE, link = 1))
#
# preds_kgr_model = kgr_model4$summary.fitted.values
#Proposed model 5
kgr_formula5 = response ~ -1 + months + Intercept1 + Intercept2 + Intercept3 + Intercept4 + Intercept5 + Intercept6 + Intercept7 + f(id,model = "generic0",Cmatrix = inv_covGP5[c(starting_data_subset$id2),c(starting_data_subset$id2)])
kgr_model5 = inla(kgr_formula5, data = starting_data_subset, family = "poisson",
control.predictor = list(compute = TRUE, link = 1))
preds_kgr_model = kgr_model5$summary.fitted.values
###Append KGR model predictions to starting data
preds_kgr_model$mean = round(preds_kgr_model$mean)
end2 = nrow(preds_kgr_model)
pred_data = preds_kgr_model$mean[(end2-41):end2]
starting_data$response[(end+1):(end+42)] = pred_data
# starting_data$response = replace(starting_data$response,which(starting_data$response < 0),0)
starting_data$months = as.numeric(starting_data$months)
}
#Plot of posterior predictive estimates (months 37-60) with credible interval bands OVERLAID on response
true_mortality = inla_full_data
for (i in 1:num_clus){
df = true_mortality %>% filter(id == i) %>% select(response)
preds = starting_data %>% filter(id == i)
colnames(preds)[3] = "mean"
df = cbind(df,preds)
post_pred_plot = df %>% ggplot(aes(x=time,y=response)) + geom_point() +
geom_line(aes(y=mean),color = "red") + geom_vline(xintercept = 36,linetype = "dashed",color = "blue",linewidth = 1.5)
print(post_pred_plot)
}